hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAQWPQ

-----------------------------------------
We add a new per-cgroup cpu.steal_task interface in cgroup v1. The default
value is 0, which means task stealing is disabled for the cgroup; writing 1
enables it.

To enable the steal task feature for a cgroup, first add group_steal to the
kernel command line, then enable STEAL in sched_features, and finally set the
cgroup's cpu.steal_task to 1.
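
As a usage sketch (the cgroup v1 mount point and the debugfs path below are
assumptions and may differ on a given system):

  1) boot with group_steal on the kernel command line;
  2) echo STEAL > /sys/kernel/debug/sched_features
     (or /sys/kernel/debug/sched/features, depending on the kernel version);
  3) echo 1 > /sys/fs/cgroup/cpu/<group>/cpu.steal_task

Writing 0 to cpu.steal_task disables stealing for that cgroup again. The file
is not exposed on the root cgroup (CFTYPE_NOT_ON_ROOT), and if group_steal is
not in use the write is rejected with -EPERM by cpu_steal_task_write().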
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 include/linux/sched.h |   4 ++
 kernel/sched/core.c   | 108 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h  |  19 +++++++-
 3 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 97c216bfb0fc..57de624f17a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -522,7 +522,11 @@ struct sched_entity {
 #else
 	KABI_RESERVE(1)
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	KABI_USE(2, int steal_task)
+#else
 	KABI_RESERVE(2)
+#endif
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7595a3fef28f..900637a6ac09 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8278,6 +8278,9 @@ void __init sched_init(void)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	root_task_group.smt_expell = TG_SMT_EXPELL;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	root_task_group.steal_task = TG_STEAL_NO;
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	root_task_group.rt_se = (struct sched_rt_entity **)ptr;
 	ptr += nr_cpu_ids * sizeof(void **);
@@ -8718,6 +8721,20 @@ static void sched_free_group(struct task_group *tg)
 	kmem_cache_free(task_group_cache, tg);
 }

+#ifdef CONFIG_SCHED_STEAL
+static void sched_change_steal_group(struct task_struct *tsk, struct task_group *tg)
+{
+	struct sched_entity *se = &tsk->se;
+
+	se->steal_task = tg->steal_task;
+}
+
+static inline void tg_init_steal(struct task_group *tg, struct task_group *ptg)
+{
+	tg->steal_task = ptg->steal_task;
+}
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg)
 {
@@ -8745,6 +8762,10 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;

+#ifdef CONFIG_SCHED_STEAL
+	tg_init_steal(tg, parent);
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 	tg_init_tag(tg, parent);
 #endif
@@ -8820,6 +8841,10 @@ static void sched_change_group(struct task_struct *tsk, int type)
 	sched_change_qos_group(tsk, tg);
 #endif

+#ifdef CONFIG_SCHED_STEAL
+	sched_change_steal_group(tsk, tg);
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 	/*
 	 * This function has cleared and restored the task status,
@@ -9784,6 +9809,81 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css,
 }
 #endif

+#ifdef CONFIG_SCHED_STEAL
+static inline s64 cpu_steal_task_read(struct cgroup_subsys_state *css,
+				      struct cftype *cft)
+{
+	return css_tg(css)->steal_task;
+}
+
+void sched_setsteal(struct task_struct *tsk, s64 steal_task)
+{
+	struct sched_entity *se = &tsk->se;
+	int queued, running, queue_flags =
+		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+	struct rq_flags rf;
+	struct rq *rq;
+
+	if (se->steal_task == steal_task)
+		return;
+
+	rq = task_rq_lock(tsk, &rf);
+
+	running = task_current(rq, tsk);
+	queued = task_on_rq_queued(tsk);
+
+	update_rq_clock(rq);
+	if (queued)
+		dequeue_task(rq, tsk, queue_flags);
+	if (running)
+		put_prev_task(rq, tsk);
+
+	se->steal_task = steal_task;
+
+	if (queued)
+		enqueue_task(rq, tsk, queue_flags);
+	if (running)
+		set_next_task(rq, tsk);
+
+	task_rq_unlock(rq, tsk, &rf);
+}
+
+int tg_change_steal(struct task_group *tg, void *data)
+{
+	struct css_task_iter it;
+	struct task_struct *tsk;
+	s64 steal_task = *(s64 *)data;
+	struct cgroup_subsys_state *css = &tg->css;
+
+	tg->steal_task = steal_task;
+
+	css_task_iter_start(css, 0, &it);
+	while ((tsk = css_task_iter_next(&it)))
+		sched_setsteal(tsk, steal_task);
+	css_task_iter_end(&it);
+
+	return 0;
+}
+
+static int cpu_steal_task_write(struct cgroup_subsys_state *css,
+				struct cftype *cftype, s64 steal_task)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (!group_steal_used())
+		return -EPERM;
+
+	if (steal_task < TG_STEAL_NO || steal_task > TG_STEAL)
+		return -EINVAL;
+
+	rcu_read_lock();
+	walk_tg_tree_from(tg, tg_change_steal, tg_nop, (void *)(&steal_task));
+	rcu_read_unlock();
+
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 void sched_settag(struct task_struct *tsk, s64 tag)
 {
@@ -9950,6 +10050,14 @@ static struct cftype cpu_legacy_files[] = {
 		.write_s64 = cpu_smt_expell_write,
 	},
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	{
+		.name = "steal_task",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_steal_task_read,
+		.write_s64 = cpu_steal_task_write,
+	},
+#endif
 #ifdef CONFIG_BPF_SCHED
 	{
 		.name = "tag",
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 87c63fa5625e..ceea107a1dc8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -402,7 +402,6 @@ struct cfs_bandwidth {
 #endif
 };

-
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 #define AD_LEVEL_MAX	8

@@ -497,7 +496,13 @@ struct task_group {
 #else
 	KABI_RESERVE(2)
 #endif
+
+#ifdef CONFIG_SCHED_STEAL
+	KABI_USE(3, int steal_task)
+#else
 	KABI_RESERVE(3)
+#endif
+
 #if defined(CONFIG_QOS_SCHED_SMART_GRID) && !defined(__GENKSYMS__)
 	KABI_USE(4, struct auto_affinity *auto_affinity)
 #else
@@ -505,6 +510,18 @@ struct task_group {
 #endif
 };

+#ifdef CONFIG_SCHED_STEAL
+enum tg_steal_task {
+	TG_STEAL_NO	= 0,
+	TG_STEAL	= 1,
+};
+
+static inline bool is_tg_steal(int steal_task)
+{
+	return steal_task == TG_STEAL;
+}
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD
