
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/IC8X6H

--------------------------------

In the previous patch, we introduced the soft domain. Now we attach task
groups to soft domains, so that tasks are preferentially scheduled into
their associated soft scheduling domains during low-load periods.

To enable the soft domain scheduling feature for a task group, write '1'
to the cpu.soft_domain file of the CPU cgroup subsystem. This operation
allocates sub-soft_domains matching the CPU quota of the cgroup (if
cpu.cfs_quota_us is -1, it is treated as 1) to the task group,
subsequently establishing a preferred scheduling domain dedicated to this
group.

Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---
 kernel/sched/core.c        |  71 ++++++++++
 kernel/sched/fair.c        |  20 +++
 kernel/sched/sched.h       |  22 ++++
 kernel/sched/soft_domain.c | 260 +++++++++++++++++++++++++++++++++++++
 4 files changed, 373 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b3092685f267..961a5f95b128 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11725,6 +11725,58 @@ static inline s64 cpu_tag_read(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+
+static int cpu_soft_domain_write_s64(struct cgroup_subsys_state *css,
+				     struct cftype *cftype,
+				     s64 val)
+{
+	return sched_group_set_soft_domain(css_tg(css), val);
+}
+
+static s64 cpu_soft_domain_read_s64(struct cgroup_subsys_state *css,
+				    struct cftype *cftype)
+{
+	struct task_group *tg = css_tg(css);
+
+	return (s64)tg->sf_ctx->policy;
+}
+
+static int cpu_soft_domain_quota_write_u64(struct cgroup_subsys_state *css,
+					   struct cftype *cftype, u64 val)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (tg->sf_ctx->policy != 0)
+		return -EINVAL;
+
+	if (val > cpumask_weight(cpumask_of_node(0)))
+		return -EINVAL;
+
+	tg->sf_ctx->nr_cpus = (int)val;
+
+	return 0;
+}
+
+static u64 cpu_soft_domain_quota_read_u64(struct cgroup_subsys_state *css,
+					  struct cftype *cftype)
+{
+	struct task_group *tg = css_tg(css);
+
+	return (u64)tg->sf_ctx->nr_cpus;
+}
+
+static int soft_domain_cpu_list_seq_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	seq_printf(sf, "%*pbl\n", cpumask_pr_args(to_cpumask(tg->sf_ctx->span)));
+
+	return 0;
+}
+
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -11763,6 +11815,25 @@ static struct cftype cpu_legacy_files[] = {
 		.write_u64 = cpu_rebuild_affinity_domain_u64,
 	},
 #endif
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+	{
+		.name = "soft_domain",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_soft_domain_read_s64,
+		.write_s64 = cpu_soft_domain_write_s64,
+	},
+	{
+		.name = "soft_domain_nr_cpu",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_soft_domain_quota_read_u64,
+		.write_u64 = cpu_soft_domain_quota_write_u64,
+	},
+	{
+		.name = "soft_domain_cpu_list",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = soft_domain_cpu_list_seq_show,
+	},
+#endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.name = "cfs_quota_us",
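
The three cgroup files added above are the whole user-visible interface. Below
is a minimal usage sketch (an editor's illustration, not part of the patch): it
enables the soft domain for an existing group and reads back the CPU span the
group was given. It assumes the cgroup v1 cpu controller is mounted at
/sys/fs/cgroup/cpu and that a group named "mygrp" already exists; the mount
point and group name are only placeholders. Judging from
sched_group_set_soft_domain() and find_idlest_llc() below, writing -1 lets the
kernel pick the idlest LLC, a positive value N appears to select the LLC of
NUMA node N-1, and writing 0 detaches the group again.

/* Editor's sketch, not part of the patch; paths and group name are examples. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	ssize_t n;
	int fd;

	/* '1' enables the feature as described in the commit message. */
	fd = open("/sys/fs/cgroup/cpu/mygrp/cpu.soft_domain", O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror("enable soft domain");
		return 1;
	}
	close(fd);

	/* Read back the CPUs the group is now preferentially placed on. */
	fd = open("/sys/fs/cgroup/cpu/mygrp/cpu.soft_domain_cpu_list", O_RDONLY);
	if (fd < 0) {
		perror("read cpu list");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("soft domain cpus: %s", buf);
	}
	close(fd);

	return 0;
}
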
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7df2b07629c0..775447f31a5b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -14730,6 +14730,22 @@ void free_fair_sched_group(struct task_group *tg)
 	kfree(tg->se);
 }
 
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+int init_soft_domain(struct task_group *tg)
+{
+	struct soft_domain_ctx *sf_ctx = NULL;
+
+	sf_ctx = kzalloc(sizeof(*sf_ctx) + cpumask_size(), GFP_KERNEL);
+	if (!sf_ctx)
+		return -ENOMEM;
+
+	sf_ctx->policy = 0;
+	tg->sf_ctx = sf_ctx;
+
+	return 0;
+}
+#endif
+
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct sched_entity *se;
@@ -14750,6 +14766,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	if (ret)
 		goto err;
 
+	ret = init_soft_domain(tg);
+	if (ret)
+		goto err;
+
 	for_each_possible_cpu(i) {
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 056a680ae9ed..0dc1fccde30b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -404,6 +404,16 @@ struct auto_affinity {
 };
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+
+struct soft_domain_ctx {
+	int policy;
+	int nr_cpus;
+	struct soft_domain *sf_d;
+	unsigned long span[];
+};
+#endif
+
 /* Task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -469,7 +479,11 @@ struct task_group {
 	struct auto_affinity *auto_affinity;
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+	KABI_USE(1, struct soft_domain_ctx *sf_ctx)
+#else
 	KABI_RESERVE(1)
+#endif
 	KABI_RESERVE(2)
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
@@ -3736,6 +3750,10 @@ bool bpf_sched_is_cpu_allowed(struct task_struct *p, int cpu);
 
 #ifdef CONFIG_SCHED_SOFT_DOMAIN
 void build_soft_domain(void);
+int init_soft_domain(struct task_group *tg);
+
+int sched_group_set_soft_domain(struct task_group *tg, long val);
+
 static inline struct cpumask *soft_domain_span(unsigned long span[])
 {
 	return to_cpumask(span);
@@ -3743,6 +3761,10 @@ static inline struct cpumask *soft_domain_span(unsigned long span[])
 
 #else
 static inline void build_soft_domain(void) { }
+static inline int init_soft_domain(struct task_group *tg)
+{
+	return 0;
+}
 #endif
 
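
Before reading the soft_domain.c changes, the sizing rule is worth spelling
out. The sketch below is an editor's userspace mirror of __calc_cpu() further
down (not part of the patch): an explicit cpu.soft_domain_nr_cpu value wins,
otherwise the group gets quota/period CPUs from CFS bandwidth, and an
unlimited quota (cpu.cfs_quota_us == -1) falls back to one CPU. The values in
main() are invented.

#include <stdio.h>

#define RUNTIME_INF	(~0ULL)	/* stands in for the kernel's "no quota" marker */

/* Mirror of __calc_cpu(); quota and period are in the same time unit. */
static int calc_cpu(int nr_cpus, unsigned long long quota, unsigned long long period)
{
	int nr_cpu = 1;

	if (nr_cpus)
		nr_cpu = nr_cpus;		/* explicit cpu.soft_domain_nr_cpu */
	else if (quota != RUNTIME_INF)
		nr_cpu = quota / period > 1 ? quota / period : 1;

	return nr_cpu;
}

int main(void)
{
	printf("%d\n", calc_cpu(0, 400000ULL, 100000ULL));	/* quota of 4 CPUs -> 4 */
	printf("%d\n", calc_cpu(0, RUNTIME_INF, 100000ULL));	/* no quota       -> 1 */
	printf("%d\n", calc_cpu(6, RUNTIME_INF, 100000ULL));	/* explicit value -> 6 */

	return 0;
}
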
diff --git a/kernel/sched/soft_domain.c b/kernel/sched/soft_domain.c
index 1be52b056cad..11c2ae1fab95 100644
--- a/kernel/sched/soft_domain.c
+++ b/kernel/sched/soft_domain.c
@@ -15,6 +15,7 @@
  *
  *
  */
+#include <linux/sort.h>
 
 static DEFINE_PER_CPU(struct soft_domain *, g_sf_d);
 
@@ -111,3 +112,262 @@ void build_soft_domain(void)
 out:
 	rcu_read_unlock();
 }
+
+static DEFINE_MUTEX(soft_domain_mutex);
+
+#define NR_MAX_CLUSTER 16
+
+struct domain_node {
+	struct soft_subdomain	*sud_d;
+	unsigned int		attached;
+	unsigned long		util;
+};
+
+static int subdomain_cmp(const void *a, const void *b)
+{
+	struct domain_node *ca = (struct domain_node *)a;
+	struct domain_node *cb = (struct domain_node *)b;
+
+	if (ca->attached < cb->attached ||
+	    (ca->attached == cb->attached && ca->util < cb->util))
+		return -1;
+
+	return 1;
+}
+
+struct soft_domain_args {
+	int policy;
+	struct cpumask *cpus;
+};
+
+static int tg_set_soft_domain(struct task_group *tg, void *data)
+{
+	struct soft_domain_args *args = (struct soft_domain_args *)data;
+
+	tg->sf_ctx->policy = args->policy;
+	if (args->policy)
+		cpumask_copy(to_cpumask(tg->sf_ctx->span), args->cpus);
+	else
+		cpumask_clear(to_cpumask(tg->sf_ctx->span));
+
+	return 0;
+}
+
+static int __calc_cpu(struct task_group *tg)
+{
+	int nr_cpu = 1;
+
+	if (tg->sf_ctx->nr_cpus)
+		nr_cpu = tg->sf_ctx->nr_cpus;
+#ifdef CONFIG_CFS_BANDWIDTH
+	else if (tg->cfs_bandwidth.quota != RUNTIME_INF)
+		nr_cpu = max(1, tg->cfs_bandwidth.quota / tg->cfs_bandwidth.period);
+#endif
+
+	tg->sf_ctx->nr_cpus = nr_cpu;
+
+	return nr_cpu;
+}
+
+static unsigned long sum_util(struct cpumask *mask)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	for_each_cpu(cpu, mask)
+		sum += cpu_util_cfs(cpu);
+
+	return sum;
+}
+
+static int __check_policy(struct task_group *tg, void *data)
+{
+	return !!tg->sf_ctx->policy;
+}
+
+static int check_policy(struct task_group *tg, long policy)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = walk_tg_tree_from(tg, __check_policy, tg_nop, NULL);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static struct soft_domain *find_idlest_llc(long policy,
+					   int nr_cpu, cpumask_var_t cpus)
+{
+	int cpu;
+	int max_cpu = 0;
+	unsigned long min_util = ULONG_MAX;
+	struct soft_domain *idlest = NULL;
+
+	/* The user has specified the llc. */
+	if (policy > 0) {
+		cpu = cpumask_first(cpumask_of_node(policy - 1));
+		idlest = rcu_dereference(per_cpu(g_sf_d, cpu));
+		return idlest;
+	}
+
+	cpumask_copy(cpus, cpu_active_mask);
+	for_each_cpu(cpu, cpus) {
+		struct soft_domain *sf_d = NULL;
+
+		sf_d = rcu_dereference(per_cpu(g_sf_d, cpu));
+		if (sf_d == NULL)
+			continue;
+
+		/*
+		 * LLC selection order:
+		 * 1. If an llc has enough idle cpus, the one with more
+		 *    idle cpus is better;
+		 * 2. If no llc has enough idle cpus, the one with lower
+		 *    util is better.
+		 */
+		if (sf_d->nr_available_cpus > max_cpu &&
+		    nr_cpu <= sf_d->nr_available_cpus) {
+			max_cpu = sf_d->nr_available_cpus;
+			idlest = sf_d;
+		} else if (max_cpu == 0) { /* No llc meets the demand. */
+			unsigned long util = sum_util(to_cpumask(sf_d->span));
+
+			if (idlest == NULL || util < min_util) {
+				idlest = sf_d;
+				min_util = util;
+			}
+		}
+
+		cpumask_andnot(cpus, cpus, to_cpumask(sf_d->span));
+	}
+
+	return idlest;
+}
+
+static int __sched_group_set_soft_domain(struct task_group *tg, long policy)
+{
+	int cpu;
+	int ret = 0;
+	cpumask_var_t cpus;
+	int nr_cpu = __calc_cpu(tg);
+	struct soft_domain_args args;
+	struct domain_node nodes[NR_MAX_CLUSTER] = {0};
+
+	if (check_policy(tg, policy))
+		return -EINVAL;
+
+	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
+		return -ENOMEM;
+
+	scoped_guard (cpus_read_lock) {
+		struct soft_domain *sf_d = NULL;
+
+		rcu_read_lock();
+		/* 1. Find the idlest llc. */
+		sf_d = find_idlest_llc(policy, nr_cpu, cpus);
+		if (sf_d != NULL) {
+			/* 2. Select the idlest clusters. */
+			struct list_head *children = &sf_d->child_domain;
+			struct soft_subdomain *sub_d = NULL;
+			int nr = 0, i;
+			struct cpumask *tmpmask = NULL;
+			int tmp_cpu = nr_cpu;
+
+			list_for_each_entry(sub_d, children, node) {
+				nodes[nr].sud_d = sub_d;
+				nodes[nr].attached = sub_d->attached;
+				tmpmask = to_cpumask(sub_d->span);
+				cpu = cpumask_first(tmpmask);
+				nodes[nr].util = sum_util(tmpmask);
+				nr++;
+			}
+
+			cpumask_clear(cpus);
+
+			sort(nodes, nr, sizeof(struct domain_node), subdomain_cmp, NULL);
+			sf_d->nr_available_cpus -= min(sf_d->nr_available_cpus, tmp_cpu);
+			for (i = 0; i < nr; i++) {
+				sub_d = nodes[i].sud_d;
+				tmpmask = to_cpumask(sub_d->span);
+				cpumask_or(cpus, cpus, tmpmask);
+				sub_d->attached++;
+				nr_cpu -= cpumask_weight(tmpmask);
+				if (nr_cpu <= 0)
+					break;
+			}
+
+			/* 3. Attach the task group to the soft domain. */
+			args.policy = policy;
+			args.cpus = cpus;
+			walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args);
+
+			/*
+			 * 4. TODO:
+			 * Add tg to the llc domain task_groups list for load balance.
+			 */
+			tg->sf_ctx->sf_d = sf_d;
+		} else {
+			ret = -EINVAL;
+		}
+		rcu_read_unlock();
+	}
+
+	free_cpumask_var(cpus);
+
+	return ret;
+}
+
+static int __sched_group_unset_soft_domain(struct task_group *tg)
+{
+	struct soft_domain_args args = {
+		.policy = 0,
+	};
+	struct soft_domain *sf_d = NULL;
+	struct soft_subdomain *sub_d = NULL;
+	struct list_head *children = NULL;
+
+	/*
+	 * If the parent has set a soft domain, the child group can't
+	 * unset itself.
+	 */
+	if (tg->parent->sf_ctx->policy != 0)
+		return -EINVAL;
+
+	sf_d = tg->sf_ctx->sf_d;
+	sf_d->nr_available_cpus += __calc_cpu(tg);
+	children = &sf_d->child_domain;
+
+	list_for_each_entry(sub_d, children, node) {
+		if (cpumask_intersects(to_cpumask(tg->sf_ctx->span), to_cpumask(sub_d->span)))
+			sub_d->attached--;
+	}
+
+	walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args);
+
+	return 0;
+}
+
+int sched_group_set_soft_domain(struct task_group *tg, long val)
+{
+	int ret = 0;
+
+	if (val < -1 || val > nr_node_ids)
+		return -EINVAL;
+
+	mutex_lock(&soft_domain_mutex);
+
+	/* If enable or disable is repeated, return directly. */
+	if (!!tg->sf_ctx->policy == !!val)
+		goto out;
+
+	if (val == 0)
+		ret = __sched_group_unset_soft_domain(tg);
+	else
+		ret = __sched_group_set_soft_domain(tg, val);
+
+	if (!ret)
+		tg->sf_ctx->policy = val;
+
+out:
+	mutex_unlock(&soft_domain_mutex);
+
+	return ret;
+}
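
The cluster ordering used in step 2 above is easy to check with a plain
userspace mock (an editor's illustration, not part of the patch): clusters
with fewer already-attached groups sort first, and ties are broken by lower
summed utilization. The struct and the sample values are invented; the
comparator body mirrors subdomain_cmp() above.

#include <stdio.h>
#include <stdlib.h>

struct node {
	const char *name;	/* example label, not in the patch */
	unsigned int attached;	/* task groups already placed on this cluster */
	unsigned long util;	/* summed cfs utilization of its CPUs */
};

/* Same comparison as subdomain_cmp(), applied with qsort() instead of sort(). */
static int node_cmp(const void *a, const void *b)
{
	const struct node *ca = a, *cb = b;

	if (ca->attached < cb->attached ||
	    (ca->attached == cb->attached && ca->util < cb->util))
		return -1;

	return 1;
}

int main(void)
{
	struct node nodes[] = {
		{ "cluster0", 1, 120 },
		{ "cluster1", 0, 300 },
		{ "cluster2", 0,  80 },
	};
	int i;

	qsort(nodes, 3, sizeof(nodes[0]), node_cmp);

	/* Prints cluster2, cluster1, cluster0. */
	for (i = 0; i < 3; i++)
		printf("%s\n", nodes[i].name);

	return 0;
}
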
-- 
2.18.0.huawei.25
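
Addendum (editor's illustration, not part of the patch): the LLC choice in
step 1, find_idlest_llc(), can be summarised by the mock below. An LLC that
still has enough available CPUs is preferred, and among those the one with the
most available CPUs wins; only when no LLC can fit the request does the search
fall back to the LLC with the lowest summed utilization. All names and numbers
are invented.

#include <limits.h>
#include <stdio.h>

struct llc {
	const char *name;	/* example label */
	int nr_available_cpus;	/* CPUs not yet handed to a soft domain */
	unsigned long util;	/* summed cfs utilization of the LLC */
};

/* Same selection rule as find_idlest_llc(), without the cpumask plumbing. */
static const struct llc *pick_llc(const struct llc *llcs, int n, int nr_cpu)
{
	const struct llc *idlest = NULL;
	unsigned long min_util = ULONG_MAX;
	int max_cpu = 0;
	int i;

	for (i = 0; i < n; i++) {
		const struct llc *l = &llcs[i];

		if (l->nr_available_cpus > max_cpu && nr_cpu <= l->nr_available_cpus) {
			max_cpu = l->nr_available_cpus;
			idlest = l;
		} else if (max_cpu == 0) {
			/* Nothing fits so far: remember the least utilized LLC. */
			if (idlest == NULL || l->util < min_util) {
				idlest = l;
				min_util = l->util;
			}
		}
	}

	return idlest;
}

int main(void)
{
	const struct llc llcs[] = {
		{ "llc0", 2, 900 },
		{ "llc1", 6, 400 },
		{ "llc2", 4, 100 },
	};

	printf("%s\n", pick_llc(llcs, 3, 4)->name);	/* llc1: most available CPUs that fit */
	printf("%s\n", pick_llc(llcs, 3, 8)->name);	/* llc2: nothing fits, lowest util */

	return 0;
}
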