[PATCH OLK-5.10 V1] sched: More flexible use of CPU quota when CPU is idle

From: Zheng Zucheng <zhengzucheng@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICE7WC

--------------------------------

This feature allows users to use their CPU quota more flexibly when the
CPU is idle, which can cause the CPU quota to be exceeded. Therefore, it
cannot be used in scenarios with strict restrictions on CPU quota usage,
such as commercial scenarios that charge based on the CPU quota consumed.

Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Signed-off-by: Liao Chang <liaochang1@huawei.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 arch/arm64/Kconfig                     |   1 +
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/arm64/kernel/topology.c           |  32 ++++++
 include/linux/sched/sysctl.h           |   4 +
 init/Kconfig                           |  18 ++++
 kernel/sched/core.c                    |  32 ++++++
 kernel/sched/fair.c                    | 143 ++++++++++++++++++++++++-
 kernel/sched/idle.c                    |  13 +++
 kernel/sched/sched.h                   |  12 +++
 kernel/sysctl.c                        |  11 ++
 10 files changed, 263 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb30ef59aca2..4ba485650d0a 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -80,6 +80,7 @@ config ARM64
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_SUPPORTS_SCHED_KEEP_ON_CORE
 	select ARCH_SUPPORTS_SCHED_PARAL
+	select ARCH_SUPPORTS_SCHED_SOFT_QUOTA
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index be1faf2da008..1e1e70a6736d 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -191,6 +191,7 @@ CONFIG_NET_NS=y
 CONFIG_SCHED_STEAL=y
 CONFIG_SCHED_KEEP_ON_CORE=y
 CONFIG_SCHED_PARAL=y
+CONFIG_SCHED_SOFT_QUOTA=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 # CONFIG_SYSFS_DEPRECATED is not set
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 785de5b9696d..b3ae5c6de81e 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -363,6 +363,38 @@ void topology_scale_freq_tick(void)
 	this_cpu_write(arch_const_cycles_prev, const_cnt);
 }
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static DEFINE_PER_CPU(int, sibling_idle) = 1;
+
+int is_sibling_idle(void)
+{
+	return this_cpu_read(sibling_idle);
+}
+
+static void smt_measurement_begin(void)
+{
+	// TODO
+}
+
+static void smt_measurement_done(void)
+{
+	// TODO
+}
+#else
+static inline void smt_measurement_begin(void) { }
+static inline void smt_measurement_done(void) { }
+#endif
+
+void arch_cpu_idle_enter(void)
+{
+	smt_measurement_begin();
+}
+
+void arch_cpu_idle_exit(void)
+{
+	smt_measurement_done();
+}
+
 #ifdef CONFIG_ACPI_CPPC_LIB
 #include <acpi/cppc_acpi.h>
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 9f998be56bdd..90021477ea4c 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,10 @@ extern unsigned int sysctl_smart_grid_strategy_ctrl;
 extern int sysctl_affinity_adjust_delay_ms;
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+extern unsigned int sysctl_soft_runtime_ratio;
+#endif
+
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
 	SCHED_TUNABLESCALING_LOG,
diff --git a/init/Kconfig b/init/Kconfig
index 5f88cce193e8..2ee50c638ca3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1411,6 +1411,24 @@ config SCHED_PARAL
 	  3. The existing "qos dynamic affinity" and "qos smart grid" features
 	     must not be used simultaneously.
 
+#
+# For architectures that want to enable the support for SCHED_SOFT_QUOTA
+#
+config ARCH_SUPPORTS_SCHED_SOFT_QUOTA
+	bool
+
+config SCHED_SOFT_QUOTA
+	bool "More flexible use of CPU quota"
+	depends on ARCH_SUPPORTS_SCHED_SOFT_QUOTA
+	depends on CFS_BANDWIDTH
+	default n
+	help
+	  This option allows users to use CPU quota more flexibly when CPU
+	  is idle. It is better for users to have some understanding of
+	  CFS_BANDWIDTH. It cannot be used in scenarios where there are strict
+	  restrictions on the use of the CPU quota, such as some commercial
+	  scenarios that charge based on the use of CPU quota.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support"
 	select PROC_CHILDREN
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 457eeebc7b62..72cb2c1adb7b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9902,6 +9902,30 @@ static int cpu_steal_task_write(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static int cpu_soft_quota_write(struct cgroup_subsys_state *css,
+				struct cftype *cftype, s64 soft_quota)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (soft_quota != 1 && soft_quota != 0)
+		return -EINVAL;
+
+	if (tg->soft_quota == soft_quota)
+		return 0;
+
+	tg->soft_quota = soft_quota;
+
+	return 0;
+}
+
+static inline s64 cpu_soft_quota_read(struct cgroup_subsys_state *css,
+				      struct cftype *cft)
+{
+	return css_tg(css)->soft_quota;
+}
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 void sched_settag(struct task_struct *tsk, s64 tag)
 {
@@ -10064,6 +10088,14 @@ static struct cftype cpu_legacy_files[] = {
 		.write_s64 = cpu_qos_write,
 	},
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	{
+		.name = "soft_quota",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_soft_quota_read,
+		.write_s64 = cpu_soft_quota_write,
+	},
+#endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	{
 		.name = "smt_expell",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b7544a14225c..f601a6ea031e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -140,6 +140,10 @@ static int unthrottle_qos_cfs_rqs(int cpu);
 static bool qos_smt_expelled(int this_cpu);
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, soft_quota_throttled_cfs_rq);
+#endif
+
 #ifdef CONFIG_QOS_SCHED_MULTILEVEL
 #define QOS_LEVEL_WEIGHT_OFFLINE_EX	1
 #define QOS_LEVEL_WEIGHT_OFFLINE	10
@@ -439,10 +443,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
-static void
+static bool
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
 	int se_depth, pse_depth;
+	bool ret = false;
 
 	/*
 	 * preemption test can be made between sibling entities who are in the
@@ -456,6 +461,10 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (!ret && cfs_rq_of(*se)->soft_quota_enable == 1)
+			ret = true;
+#endif
 		se_depth--;
 		*se = parent_entity(*se);
 	}
@@ -466,9 +475,15 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	}
 
 	while (!is_same_group(*se, *pse)) {
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (!ret && cfs_rq_of(*se)->soft_quota_enable == 1)
+			ret = true;
+#endif
 		*se = parent_entity(*se);
 		*pse = parent_entity(*pse);
 	}
+
+	return ret;
 }
 
 #else	/* !CONFIG_FAIR_GROUP_SCHED */
@@ -503,9 +518,10 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return NULL;
 }
 
-static inline void
+static inline bool
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
+	return false;
 }
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -5396,6 +5412,14 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	 */
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	if (cfs_rq->tg->soft_quota == 1) {
+		list_add(&cfs_rq->soft_quota_throttled_list,
+			 &per_cpu(soft_quota_throttled_cfs_rq, cpu_of(rq)));
+	}
+#endif
+
 	return true;
 }
 
@@ -5414,6 +5438,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	list_del_init(&cfs_rq->soft_quota_throttled_list);
+#endif
+
 #ifdef CONFIG_QOS_SCHED
 	/*
 	 * if this cfs_rq throttled by qos, not need unthrottle it.
@@ -5531,6 +5559,16 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 		struct rq_flags rf;
 
 		rq_lock_irqsave(rq, &rf);
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (cfs_rq->soft_quota_enable == 1) {
+			if (cfs_rq->runtime_remaining > 0)
+				cfs_rq->runtime_remaining = 0;
+
+			cfs_rq->soft_quota_enable = 0;
+		}
+#endif
+
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;
@@ -5573,6 +5611,17 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 	rcu_read_unlock();
 }
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static inline void init_tg_sum_soft_runtime(struct cfs_bandwidth *cfs_b)
+{
+	unsigned int cpu;
+	struct task_group *tg = container_of(cfs_b, struct task_group, cfs_bandwidth);
+
+	for_each_possible_cpu(cpu)
+		tg->cfs_rq[cpu]->sum_soft_runtime = 0;
+}
+#endif
+
 /*
  * Responsible for refilling a task_group's bandwidth and unthrottling its
  * cfs_rqs as appropriate. If there has been no activity within the last
@@ -5590,6 +5639,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 	cfs_b->nr_periods += overrun;
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	init_tg_sum_soft_runtime(cfs_b);
+#endif
+
 	/* Refill extra burst quota even if cfs_b->idle */
 	__refill_cfs_bandwidth_runtime(cfs_b);
 
@@ -5898,6 +5951,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED
 	INIT_LIST_HEAD(&cfs_rq->qos_throttled_list);
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	INIT_LIST_HEAD(&cfs_rq->soft_quota_throttled_list);
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -8536,6 +8592,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
 	int next_buddy_marked = 0;
+	bool ret = false;
 
 	if (unlikely(se == pse))
 		return;
@@ -8590,7 +8647,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
 		return;
 
-	find_matching_se(&se, &pse);
+	ret = find_matching_se(&se, &pse);
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	if (ret)
+		goto preempt;
+#endif
+
 	update_curr(cfs_rq_of(se));
 	BUG_ON(!pse);
 	if (wakeup_preempt_entity(se, pse) == 1) {
@@ -13823,6 +13886,9 @@ static void task_change_group_fair(struct task_struct *p, int type)
 void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	struct cfs_rq *cfs_rq;
+#endif
 
 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
 	destroy_auto_affinity(tg);
@@ -13831,6 +13897,12 @@ void free_fair_sched_group(struct task_group *tg)
 #ifdef CONFIG_QOS_SCHED
 		if (tg->cfs_rq && tg->cfs_rq[i])
 			unthrottle_qos_sched_group(tg->cfs_rq[i]);
+#endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (tg->cfs_rq && tg->cfs_rq[i]) {
+			cfs_rq = tg->cfs_rq[i];
+			list_del_init(&cfs_rq->soft_quota_throttled_list);
+		}
 #endif
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
@@ -14209,13 +14281,20 @@ void task_tick_relationship(struct rq *rq, struct task_struct *curr)
 
 __init void init_sched_fair_class(void)
 {
-#ifdef CONFIG_QOS_SCHED
+#if defined(CONFIG_QOS_SCHED) || defined(CONFIG_SCHED_SOFT_QUOTA)
 	int i;
+#endif
 
+#ifdef CONFIG_QOS_SCHED
 	for_each_possible_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i));
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(soft_quota_throttled_cfs_rq, i));
+#endif
+
 	init_sched_numa_icon();
 
 #ifdef CONFIG_SMP
@@ -14327,3 +14406,59 @@ int sched_trace_rq_nr_running(struct rq *rq)
 	return rq ? rq->nr_running : -1;
 }
 EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+unsigned int sysctl_soft_runtime_ratio = 20;
+static bool check_soft_runtime(struct task_group *tg, int slice)
+{
+	int cpu;
+	u64 sum_soft_runtime = slice;
+	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+	if (cfs_b->quota == RUNTIME_INF)
+		return true;
+
+	for_each_possible_cpu(cpu)
+		sum_soft_runtime += tg->cfs_rq[cpu]->sum_soft_runtime;
+
+	return sum_soft_runtime < sysctl_soft_runtime_ratio * cfs_b->quota / 100;
+}
+
+bool unthrottle_cfs_rq_soft_quota(struct rq *rq)
+{
+	int max_cnt = 0;
+	bool ret = false;
+	struct cfs_rq *cfs_rq, *tmp_rq;
+	struct cfs_bandwidth *cfs_b;
+	int slice = sched_cfs_bandwidth_slice();
+
+	list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(soft_quota_throttled_cfs_rq, cpu_of(rq)),
+				 soft_quota_throttled_list) {
+		if (max_cnt++ > 20)
+			break;
+
+		if (cfs_rq->throttled) {
+			cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+			raw_spin_lock(&cfs_b->lock);
+
+			if (!check_soft_runtime(cfs_rq->tg, slice)) {
+				raw_spin_unlock(&cfs_b->lock);
+				continue;
+			}
+
+			raw_spin_unlock(&cfs_b->lock);
+
+			if (cfs_rq->runtime_remaining + slice > 0) {
+				cfs_rq->runtime_remaining += slice;
+				cfs_rq->sum_soft_runtime += slice;
+				cfs_rq->soft_quota_enable = 1;
+				unthrottle_cfs_rq(cfs_rq);
+				ret = true;
+				break;
+			}
+		}
+	}
+
+	return ret;
+}
+#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 3c6396d61a04..4e5f603219b4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -439,10 +439,23 @@ static struct task_struct *pick_task_idle(struct rq *rq)
 }
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+int __weak is_sibling_idle(void)
+{
+	return 0;
+}
+#endif
+
 struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	struct task_struct *next = rq->idle;
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	if (unthrottle_cfs_rq_soft_quota(rq) && rq->cfs.nr_running &&
+	    is_sibling_idle())
+		return pick_next_task_fair(rq, NULL, NULL);
+#endif
+
 	set_next_task_idle(rq, next, true);
 
 	return next;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fe6342305b0f..9b2779e8fc91 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -508,6 +508,9 @@ struct task_group {
 #else
 	KABI_RESERVE(4)
 #endif
+#if defined(CONFIG_SCHED_SOFT_QUOTA)
+	KABI_EXTEND(u64 soft_quota)
+#endif
 };
 
 #ifdef CONFIG_SCHED_STEAL
@@ -606,6 +609,10 @@ static inline int init_auto_affinity(struct task_group *tg)
 static inline void tg_update_affinity_domains(int cpu, int online) {}
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+extern bool unthrottle_cfs_rq_soft_quota(struct rq *rq);
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
@@ -734,6 +741,11 @@ struct cfs_rq {
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
 #endif
+#if defined(CONFIG_SCHED_SOFT_QUOTA)
+	KABI_EXTEND(u64 soft_quota_enable)
+	KABI_EXTEND(u64 sum_soft_runtime)
+	KABI_EXTEND(struct list_head soft_quota_throttled_list)
+#endif
 };
 
 static inline int rt_bandwidth_enabled(void)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0b1c13a05332..738d9a4455c1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2831,6 +2831,17 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &one_hundred,
 	},
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	{
+		.procname	= "sched_soft_runtime_ratio",
+		.data		= &sysctl_soft_runtime_ratio,
+		.maxlen		= sizeof(sysctl_soft_runtime_ratio),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &one_hundred,
+	},
+#endif
 #ifdef CONFIG_SCHED_STEAL
 	{
 		.procname	= "sched_max_steal_count",
-- 
2.25.1
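
[Editor's note, not part of the patch] For reviewers who want to try the feature, a minimal userspace sketch is below. It assumes a kernel built with CONFIG_SCHED_SOFT_QUOTA=y, the cgroup v1 cpu controller mounted at /sys/fs/cgroup/cpu, and an existing child group named "test" (the mount point and group name are illustrative); the cpu.soft_quota file and the kernel.sched_soft_runtime_ratio sysctl are the interfaces added by the diff above.

```c
/*
 * Illustrative usage sketch only -- not part of the patch.
 * Enables per-group soft quota and caps the borrowed runtime at 20%
 * of the group's quota via the new sysctl.
 */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	/* Per-group switch added by this patch: 0 = off, 1 = on. */
	write_str("/sys/fs/cgroup/cpu/test/cpu.soft_quota", "1");

	/* Global cap on extra runtime, in percent of quota (1..100). */
	write_str("/proc/sys/kernel/sched_soft_runtime_ratio", "20");

	return 0;
}
```

Per the hunks above, writes other than 0 or 1 to cpu.soft_quota return -EINVAL, and the sysctl is clamped to 1..100 by proc_dointvec_minmax.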

FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/16764
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/HMW...