[PATCH OLK-5.10 0/7] Introduce soft domain

Zhang Qiao (7):
  sched: topology: Build soft domain for LLC
  sched: Attach task group to soft domain
  sched: fair: Select idle cpu in soft domain
  sched: fair: Disable numa migration for soft domain tasks
  sched: Add cmdline sched_soft_domain switch for soft domain feature
  config: Configure CONFIG_SCHED_SOFT_DOMAIN
  sched: Balance newly forked tasks to soft domain

 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 include/linux/sched/topology.h         |  21 ++
 init/Kconfig                           |  12 +
 kernel/sched/Makefile                  |   1 +
 kernel/sched/core.c                    |  84 +++++
 kernel/sched/fair.c                    | 142 +++++++
 kernel/sched/features.h                |   4 +
 kernel/sched/sched.h                   |  41 ++
 kernel/sched/soft_domain.c             | 496 +++++++++++++++++++++++++
 10 files changed, 803 insertions(+)
 create mode 100644 kernel/sched/soft_domain.c

-- 
2.18.0.huawei.25

Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/16745
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/3G2...

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/IC8X6H

--------------------------------

On Kunpeng servers, each LLC domain contains multiple clusters. When
multiple services are deployed within the same LLC domain, their tasks
become distributed across all clusters. This results in:

1. High cache synchronization overhead between different tasks of the
   same service.
2. Severe cache contention among tasks from different services.

The soft domain architecture partitions resources by cluster. Under
low-load conditions, each service operates exclusively within its
dedicated domain to prevent cross-service interference, thereby
enhancing both CPU isolation and cache locality.

Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---
 include/linux/sched/topology.h |  21 ++++++
 init/Kconfig                   |  12 ++++
 kernel/sched/Makefile          |   1 +
 kernel/sched/core.c            |   1 +
 kernel/sched/sched.h           |  11 ++++
 kernel/sched/soft_domain.c     | 114 +++++++++++++++++++++++++++++++++
 6 files changed, 160 insertions(+)
 create mode 100644 kernel/sched/soft_domain.c

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index c60eea1c805e..31b9ccf21d5e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -78,6 +78,27 @@ extern int sched_domain_level_max;
 
 struct sched_group;
 
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+
+struct soft_subdomain {
+	/* the count of task group attached this sub domain. */
+	int attached;
+	struct list_head node;
+	unsigned long span[];
+};
+
+/*
+ * Each LLC builds a soft domain:
+ * A soft scheduling domain is divided into multiple subdomains,
+ * typically based on the physical structure of CPU clusters.
+ */
+struct soft_domain {
+	struct list_head child_domain;
+	int nr_available_cpus;
+	unsigned long span[];
+};
+#endif
+
 struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
diff --git a/init/Kconfig b/init/Kconfig
index 5f88cce193e8..7fcda138c76f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1115,6 +1115,18 @@ config QOS_SCHED_NUMA_ICON
 
 	  If in doubt, say N.
 
+config SCHED_SOFT_DOMAIN
+	bool "Soft domain scheduler"
+	depends on FAIR_GROUP_SCHED
+	depends on SCHED_CLUSTER
+	default n
+	help
+	  This feature builds a CPU soft domain for each task group. Tasks are
+	  prioritized and aggregated to execute within soft domains, which optimizes
+	  resource allocation and enhances cache locality.
+
+	  If in doubt, say N.
+
 config UCLAMP_TASK_GROUP
 	bool "Utilization clamping per group of tasks"
 	depends on CGROUP_SCHED
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ff9ff2c17f79..cc5d2cc388bf 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -42,3 +42,4 @@ obj-$(CONFIG_BPF_SCHED) += bpf_topology.o
 obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/
 obj-$(CONFIG_SCHED_TASK_RELATIONSHIP) += relationship.o relationship_ioctl.o
 obj-$(CONFIG_QOS_SCHED_NUMA_ICON) += numa_icon.o
+obj-$(CONFIG_SCHED_SOFT_DOMAIN) += soft_domain.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 457eeebc7b62..73ce0ce36c83 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8202,6 +8202,7 @@ void __init sched_init_smp(void)
 	sched_smp_initialized = true;
 
 	sched_grid_zone_init();
+	build_soft_domain();
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 	init_auto_affinity(&root_task_group);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fe6342305b0f..6232148c5099 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3221,3 +3221,14 @@ static __always_inline int task_has_qos_idle_policy(struct task_struct *p)
 
 void swake_up_all_locked(struct swait_queue_head *q);
 void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+void build_soft_domain(void);
+static inline struct cpumask *soft_domain_span(unsigned long span[])
+{
+	return to_cpumask(span);
+}
+#else
+
+static inline void build_soft_domain(void) { }
+
+#endif
diff --git a/kernel/sched/soft_domain.c b/kernel/sched/soft_domain.c
new file mode 100644
index 000000000000..ea4754a3ee65
--- /dev/null
+++ b/kernel/sched/soft_domain.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for Soft Domain Scheduling
+ *
+ * Copyright (C) 2025-2025 Huawei Technologies Co., Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include "sched.h"
+
+static DEFINE_PER_CPU(struct soft_domain *, g_sf_d);
+
+static void free_sub_soft_domain(struct soft_domain *sf_d);
+
+static int build_soft_sub_domain(struct sched_domain *sd, struct cpumask *cpus)
+{
+	struct cpumask *span = sched_domain_span(sd);
+	int nid = cpu_to_node(cpumask_first(span));
+	struct soft_domain *sf_d = NULL;
+	int i;
+
+	sf_d = kzalloc_node(sizeof(struct soft_domain) + cpumask_size(),
+			    GFP_KERNEL, nid);
+	if (!sf_d)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&sf_d->child_domain);
+	sf_d->nr_available_cpus = cpumask_weight(span);
+	cpumask_copy(to_cpumask(sf_d->span), span);
+
+	for_each_cpu_and(i, sched_domain_span(sd), cpus) {
+		struct soft_subdomain *sub_d = NULL;
+
+		sub_d = kzalloc_node(sizeof(struct soft_subdomain) + cpumask_size(),
+				     GFP_KERNEL, nid);
+		if (!sub_d) {
+			free_sub_soft_domain(sf_d);
+			return -ENOMEM;
+		}
+
+		list_add_tail(&sub_d->node, &sf_d->child_domain);
+		cpumask_copy(soft_domain_span(sub_d->span), cpu_clustergroup_mask(i));
+		cpumask_andnot(cpus, cpus, cpu_clustergroup_mask(i));
+	}
+
+	for_each_cpu(i, sched_domain_span(sd)) {
+		rcu_assign_pointer(per_cpu(g_sf_d, i), sf_d);
+	}
+
+	return 0;
+}
+
+static void free_sub_soft_domain(struct soft_domain *sf_d)
+{
+	struct list_head *children = &sf_d->child_domain;
+	struct soft_subdomain *entry = NULL, *next = NULL;
+	int i;
+
+	list_for_each_entry_safe(entry, next, children, node) {
+		list_del(&entry->node);
+		kfree(entry);
+	}
+
+	for_each_cpu(i, to_cpumask(sf_d->span)) {
+		rcu_assign_pointer(per_cpu(g_sf_d, i), NULL);
+	}
+
+	kfree(sf_d);
+}
+
+static void free_soft_domain(void)
+{
+	struct soft_domain *sf_d = NULL;
+	int i;
+
+	for_each_cpu(i, cpu_active_mask) {
+		sf_d = rcu_dereference(per_cpu(g_sf_d, i));
+		if (sf_d)
+			free_sub_soft_domain(sf_d);
+	}
+}
+
+void build_soft_domain(void)
+{
+	struct sched_domain *sd;
+	static struct cpumask cpus;
+	int i, ret;
+
+	cpumask_copy(&cpus, cpu_active_mask);
+	rcu_read_lock();
+	for_each_cpu(i, &cpus) {
+		/* build soft domain for each llc domain. */
+		sd = rcu_dereference(per_cpu(sd_llc, i));
+		if (sd) {
+			ret = build_soft_sub_domain(sd, &cpus);
+			if (ret) {
+				free_soft_domain();
+				goto out;
+			}
+		}
+	}
+
+out:
+	rcu_read_unlock();
+}
-- 
2.18.0.huawei.25
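The two structures added above form a simple hierarchy: one soft_domain per LLC, holding a list of per-cluster soft_subdomain entries plus a budget of not-yet-claimed CPUs. The following stand-alone user-space model is only an illustrative sketch of that layout, not the kernel code: it assumes at most 64 CPUs and substitutes a plain 64-bit mask and a hand-rolled singly linked list for the kernel's cpumask and list_head.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-ins for the kernel types (assumption: <= 64 CPUs). */
typedef unsigned long long cpumask_t;

struct subdom {                 /* models struct soft_subdomain */
	int attached;           /* task groups attached to this cluster */
	cpumask_t span;         /* CPUs of one cluster */
	struct subdom *next;
};

struct softdom {                /* models struct soft_domain (one per LLC) */
	int nr_available_cpus;  /* CPUs not yet handed out to groups */
	cpumask_t span;         /* all CPUs of the LLC */
	struct subdom *children;
};

/* Build one LLC-wide soft domain from clusters of cluster_sz CPUs each. */
static struct softdom *build(int first_cpu, int nr_cpu, int cluster_sz)
{
	struct softdom *sd = calloc(1, sizeof(*sd));

	for (int c = first_cpu; c < first_cpu + nr_cpu; c += cluster_sz) {
		struct subdom *sub = calloc(1, sizeof(*sub));

		for (int i = c; i < c + cluster_sz; i++)
			sub->span |= 1ULL << i;
		sub->next = sd->children;
		sd->children = sub;
		sd->span |= sub->span;
	}
	sd->nr_available_cpus = nr_cpu;
	return sd;
}

int main(void)
{
	/* e.g. one LLC covering CPUs 0-15, split into 4-CPU clusters */
	struct softdom *sd = build(0, 16, 4);

	printf("LLC span 0x%llx, available %d\n", sd->span, sd->nr_available_cpus);
	for (struct subdom *s = sd->children; s; s = s->next)
		printf("  cluster span 0x%llx, attached %d\n", s->span, s->attached);
	return 0;
}

In the patch itself the same shape is built once per LLC in build_soft_domain(), which walks sd_llc and carves the LLC span into cluster spans with cpu_clustergroup_mask().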

hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IC8X6H -------------------------------- In the previous patch, we have introduced soft domain. Now, we attach task group to soft domain, task will be preferentially scheduled into their associated soft scheduling domains during low-load periods. To enable the soft domain scheduling feature for a task group, we need to write '1' to the cpu.soft_domain file in the CPU cgroup subsystem. This operation will allocate sub-soft_domains matching the CPU quota of the cgroup(if cpu.cfs_quota_us is -1, Treat it as a 1) to the task group, subsequently establishing a preferred scheduling domain dedicated to this group. Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com> --- kernel/sched/core.c | 74 ++++++++ kernel/sched/fair.c | 6 + kernel/sched/sched.h | 30 ++++ kernel/sched/soft_domain.c | 346 ++++++++++++++++++++++++++++++++++++- 4 files changed, 452 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 73ce0ce36c83..e23a81913984 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8951,6 +8951,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + offline_soft_domain(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -9977,6 +9984,53 @@ static inline s64 cpu_tag_read(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_SCHED_SOFT_DOMAIN + +static int cpu_soft_domain_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, + s64 val) +{ + return sched_group_set_soft_domain(css_tg(css), val); +} + +static s64 cpu_soft_domain_read_s64(struct cgroup_subsys_state *css, + struct cftype *cftype) +{ + struct task_group *tg = css_tg(css); + + return (s64)tg->sf_ctx->policy; +} + +static int cpu_soft_domain_quota_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 val) +{ + struct task_group *tg = css_tg(css); + + if (val > cpumask_weight(cpumask_of_node(0))) + return -EINVAL; + + return sched_group_set_soft_domain_quota(tg, val); +} + +static u64 cpu_soft_domain_quota_read_u64(struct cgroup_subsys_state *css, + struct cftype *cftype) +{ + struct task_group *tg = css_tg(css); + + return (u64)tg->sf_ctx->nr_cpus; +} + +static int soft_domain_cpu_list_seq_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + + seq_printf(sf, "%*pbl\n", cpumask_pr_args(to_cpumask(tg->sf_ctx->span))); + + return 0; +} + +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -10010,6 +10064,25 @@ static struct cftype cpu_legacy_files[] = { .write_u64 = cpu_rebuild_affinity_domain_u64, }, #endif +#ifdef CONFIG_SCHED_SOFT_DOMAIN + { + .name = "soft_domain", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_soft_domain_read_s64, + .write_s64 = cpu_soft_domain_write_s64, + }, + { + .name = "soft_domain_nr_cpu", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_soft_domain_quota_read_u64, + .write_u64 = cpu_soft_domain_quota_write_u64, + }, + { + .name = "soft_domain_cpu_list", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = soft_domain_cpu_list_seq_show, + }, +#endif #ifdef CONFIG_CFS_BANDWIDTH { .name = "cfs_quota_us", @@ -10381,6 +10454,7 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = 
cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b7544a14225c..eb5a51218334 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13826,6 +13826,7 @@ void free_fair_sched_group(struct task_group *tg) destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); destroy_auto_affinity(tg); + destroy_soft_domain(tg); for_each_possible_cpu(i) { #ifdef CONFIG_QOS_SCHED @@ -13862,6 +13863,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (ret) goto err; + ret = init_soft_domain(tg, parent); + if (ret) + goto err; + for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); @@ -13884,6 +13889,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) kfree(cfs_rq); err: destroy_auto_affinity(tg); + destroy_soft_domain(tg); return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6232148c5099..fd4de53fb5b4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -430,6 +430,16 @@ struct auto_affinity { #endif }; +#ifdef CONFIG_SCHED_SOFT_DOMAIN + +struct soft_domain_ctx { + int policy; + int nr_cpus; + struct soft_domain *sf_d; + unsigned long span[]; +}; +#endif + /* Task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -508,6 +518,9 @@ struct task_group { #else KABI_RESERVE(4) #endif +#ifdef CONFIG_SCHED_SOFT_DOMAIN + KABI_EXTEND(struct soft_domain_ctx *sf_ctx) +#endif }; #ifdef CONFIG_SCHED_STEAL @@ -3223,6 +3236,12 @@ void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); #ifdef CONFIG_SCHED_SOFT_DOMAIN void build_soft_domain(void); +int init_soft_domain(struct task_group *tg, struct task_group *parent); +int destroy_soft_domain(struct task_group *tg); +void offline_soft_domain(struct task_group *tg); +int sched_group_set_soft_domain(struct task_group *tg, long val); +int sched_group_set_soft_domain_quota(struct task_group *tg, long val); + static inline struct cpumask *soft_domain_span(unsigned long span[]) { return to_cpumask(span); @@ -3230,5 +3249,16 @@ static inline struct cpumask *soft_domain_span(unsigned long span[]) #else static inline void build_soft_domain(void) { } +static inline int init_soft_domain(struct task_group *tg, struct task_group *parent) +{ + return 0; +} + +static inline void offline_soft_domain(struct task_group *tg) { } + +static inline int destroy_soft_domain(struct task_group *tg) +{ + return 0; +} #endif diff --git a/kernel/sched/soft_domain.c b/kernel/sched/soft_domain.c index ea4754a3ee65..f20ff3b54fdc 100644 --- a/kernel/sched/soft_domain.c +++ b/kernel/sched/soft_domain.c @@ -16,6 +16,7 @@ */ #include "sched.h" +#include <linux/sort.h> static DEFINE_PER_CPU(struct soft_domain *, g_sf_d); @@ -37,7 +38,7 @@ static int build_soft_sub_domain(struct sched_domain *sd, struct cpumask *cpus) sf_d->nr_available_cpus = cpumask_weight(span); cpumask_copy(to_cpumask(sf_d->span), span); - for_each_cpu_and(i, sched_domain_span(sd), cpus) { + for_each_cpu_and(i, span, cpus) { struct soft_subdomain *sub_d = NULL; sub_d = kzalloc_node(sizeof(struct soft_subdomain) + cpumask_size(), @@ -46,13 +47,12 @@ static int build_soft_sub_domain(struct sched_domain *sd, struct cpumask *cpus) free_sub_soft_domain(sf_d); return -ENOMEM; } - list_add_tail(&sub_d->node, 
&sf_d->child_domain); - cpumask_copy(soft_domain_span(sub_d->span), cpu_clustergroup_mask(i)); + cpumask_and(soft_domain_span(sub_d->span), span, cpu_clustergroup_mask(i)); cpumask_andnot(cpus, cpus, cpu_clustergroup_mask(i)); } - for_each_cpu(i, sched_domain_span(sd)) { + for_each_cpu(i, span) { rcu_assign_pointer(per_cpu(g_sf_d, i), sf_d); } @@ -112,3 +112,341 @@ void build_soft_domain(void) out: rcu_read_unlock(); } + +static DEFINE_MUTEX(soft_domain_mutex); + +#define NR_MAX_CLUSTER 16 + +struct domain_node { + struct soft_subdomain *sud_d; + unsigned int attached; + unsigned long util; +}; + +static int subdomain_cmp(const void *a, const void *b) +{ + struct domain_node *ca = (struct domain_node *)a; + struct domain_node *cb = (struct domain_node *)b; + + if (ca->attached < cb->attached || + (ca->attached == cb->attached && ca->util < cb->util)) + return -1; + + return 1; +} + +struct soft_domain_args { + int policy; + int nr_cpu; + struct cpumask *cpus; +}; + +static int tg_set_soft_domain(struct task_group *tg, void *data) +{ + struct soft_domain_args *args = (struct soft_domain_args *)data; + + tg->sf_ctx->policy = args->policy; + if (args->policy) { + cpumask_copy(to_cpumask(tg->sf_ctx->span), args->cpus); + tg->sf_ctx->nr_cpus = args->nr_cpu; + } else + cpumask_clear(to_cpumask(tg->sf_ctx->span)); + + return 0; +} + +static int __calc_cpu(struct task_group *tg) +{ + int nr_cpu = 1; + + if (tg->sf_ctx->nr_cpus) + nr_cpu = tg->sf_ctx->nr_cpus; +#ifdef CONFIG_CFS_BANDWIDTH + else if (tg->cfs_bandwidth.quota != RUNTIME_INF) + nr_cpu = DIV_ROUND_UP_ULL(tg->cfs_bandwidth.quota, tg->cfs_bandwidth.period); +#endif + + return nr_cpu; +} + +static unsigned long sum_util(struct cpumask *mask) +{ + unsigned long sum = 0; + int cpu; + + for_each_cpu(cpu, mask) + sum += cpu_util_cfs(cpu_rq(cpu)); + + return sum; +} + +static int __check_policy(struct task_group *tg, void *data) +{ + return !!tg->sf_ctx->policy; +} + +static int check_policy(struct task_group *tg, long policy) +{ + int ret; + + rcu_read_lock(); + ret = walk_tg_tree_from(tg, __check_policy, tg_nop, NULL); + rcu_read_unlock(); + + return ret; +} + +static struct soft_domain *find_idlest_llc(long policy, + int nr_cpu, cpumask_var_t cpus) +{ + int cpu; + int max_cpu = 0; + struct soft_domain *idlest = NULL; + unsigned long min_util = ULONG_MAX; + + /* The user has specified the llc. */ + if (policy > 0) { + for_each_cpu(cpu, cpumask_of_node(policy-1)) { + idlest = rcu_dereference(per_cpu(g_sf_d, cpu)); + if (idlest != NULL) + break; + } + + if (idlest && nr_cpu <= cpumask_weight(to_cpumask(idlest->span))) + return idlest; + + return NULL; + } + + cpumask_copy(cpus, cpu_active_mask); + for_each_cpu(cpu, cpus) { + struct soft_domain *sf_d = NULL; + struct cpumask *mask; + + sf_d = rcu_dereference(per_cpu(g_sf_d, cpu)); + if (sf_d == NULL) + continue; + + mask = to_cpumask(sf_d->span); + cpumask_andnot(cpus, cpus, mask); + if (nr_cpu > cpumask_weight(mask)) + continue; + + /* + * LLC selection order: + * 1. When the number of idle cpus meet the requirements, + * the one with more idles cpus is better; + * 2. Under the condition of insufficient idle cpus, util + * is lower, the better. 
+ */ + if (sf_d->nr_available_cpus > max_cpu && + nr_cpu <= sf_d->nr_available_cpus) { + max_cpu = sf_d->nr_available_cpus; + idlest = sf_d; + } else if (max_cpu == 0) { /* No llc meets the demand */ + unsigned long util = sum_util(mask); + + if (idlest == NULL || util < min_util) { + idlest = sf_d; + min_util = util; + } + } + } + + return idlest; +} + +static int __sched_group_set_soft_domain(struct task_group *tg, long policy) +{ + int cpu; + int ret = 0; + cpumask_var_t cpus; + struct soft_domain_args args; + struct soft_domain *sf_d = NULL; + struct domain_node nodes[NR_MAX_CLUSTER] = {0}; + int nr_cpu = __calc_cpu(tg); + + if (check_policy(tg, policy)) + return -EINVAL; + + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return -EINVAL; + + rcu_read_lock(); + /* 1. Find a idlest llc. */ + sf_d = find_idlest_llc(policy, nr_cpu, cpus); + if (sf_d != NULL) { + /* 2. select idlest clusters. */ + struct list_head *children = &sf_d->child_domain; + struct soft_subdomain *sub_d = NULL; + int nr = 0, i; + struct cpumask *tmpmask = NULL; + int tmp_cpu = nr_cpu; + + list_for_each_entry(sub_d, children, node) { + nodes[nr].sud_d = sub_d; + nodes[nr].attached = sub_d->attached; + tmpmask = to_cpumask(sub_d->span); + cpu = cpumask_first(tmpmask); + nodes[nr].util = sum_util(tmpmask); + nr++; + } + + cpumask_clear(cpus); + + sort(nodes, nr, sizeof(struct domain_node), subdomain_cmp, NULL); + sf_d->nr_available_cpus -= tmp_cpu; + for (i = 0; i < nr; i++) { + sub_d = nodes[i].sud_d; + tmpmask = to_cpumask(sub_d->span); + cpumask_or(cpus, cpus, tmpmask); + sub_d->attached++; + nr_cpu -= cpumask_weight(tmpmask); + if (nr_cpu <= 0) + break; + } + + /* 3. attach task group to softdomain. */ + args.policy = policy; + args.cpus = cpus; + args.nr_cpu = tmp_cpu; + walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args); + + /* + * 4.add tg to llc domain task_groups list for load balance. + */ + tg->sf_ctx->nr_cpus = tmp_cpu; + tg->sf_ctx->sf_d = sf_d; + } else { + ret = -EINVAL; + } + rcu_read_unlock(); + + free_cpumask_var(cpus); + + return ret; +} + +static int __sched_group_unset_soft_domain(struct task_group *tg) +{ + struct soft_domain_args args = { + .policy = 0, + }; + struct soft_domain *sf_d = NULL; + struct soft_subdomain *sub_d = NULL; + struct list_head *children = NULL; + + /* If parent has set soft domain, child group can't unset itself. */ + if (tg->parent->sf_ctx != NULL && tg->parent->sf_ctx->policy != 0) + return -EINVAL; + + sf_d = tg->sf_ctx->sf_d; + sf_d->nr_available_cpus += __calc_cpu(tg); + children = &sf_d->child_domain; + + list_for_each_entry(sub_d, children, node) { + if (cpumask_intersects(to_cpumask(tg->sf_ctx->span), to_cpumask(sub_d->span))) + sub_d->attached--; + } + + walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args); + + return 0; +} + +int sched_group_set_soft_domain(struct task_group *tg, long val) +{ + int ret = 0; + + if (val < -1 || val > nr_node_ids) + return -EINVAL; + + mutex_lock(&soft_domain_mutex); + + /* If enable or disable is repeated, directly return. 
*/ + if (!!tg->sf_ctx->policy == !!val) + goto out; + + if (val == 0) + ret = __sched_group_unset_soft_domain(tg); + else + ret = __sched_group_set_soft_domain(tg, val); + + if (!ret) + tg->sf_ctx->policy = val; + +out: + mutex_unlock(&soft_domain_mutex); + + return ret; +} + +int sched_group_set_soft_domain_quota(struct task_group *tg, long val) +{ + int ret = 0; + + mutex_lock(&soft_domain_mutex); + if (tg->sf_ctx->policy != 0) { + ret = -EINVAL; + goto out; + } else + tg->sf_ctx->nr_cpus = (int)val; + +out: + mutex_unlock(&soft_domain_mutex); + + return ret; +} + +int init_soft_domain(struct task_group *tg, struct task_group *parent) +{ + struct soft_domain_ctx *sf_ctx = NULL; + struct soft_domain_ctx *psf_ctx = NULL; + + sf_ctx = kzalloc(sizeof(*sf_ctx) + cpumask_size(), GFP_KERNEL); + if (!sf_ctx) + return -ENOMEM; + + mutex_lock(&soft_domain_mutex); + psf_ctx = parent->sf_ctx; + if (psf_ctx) { + sf_ctx->policy = psf_ctx->policy; + sf_ctx->nr_cpus = psf_ctx->nr_cpus; + cpumask_copy(to_cpumask(sf_ctx->span), to_cpumask(psf_ctx->span)); + } + + tg->sf_ctx = sf_ctx; + mutex_unlock(&soft_domain_mutex); + + return 0; +} + +void offline_soft_domain(struct task_group *tg) +{ + struct soft_domain_ctx *sf_ctx = NULL; + struct soft_domain_ctx *psf_ctx = NULL; + + sf_ctx = tg->sf_ctx; + psf_ctx = tg->parent->sf_ctx; + + if (!sf_ctx) + return; + + mutex_lock(&soft_domain_mutex); + if (sf_ctx->policy != 0) { + /* + * parent group is not set, this group set + * soft domain by user. + */ + if (psf_ctx == NULL || psf_ctx->policy == 0) + __sched_group_unset_soft_domain(tg); + } + mutex_unlock(&soft_domain_mutex); +} + +int destroy_soft_domain(struct task_group *tg) +{ + kfree(tg->sf_ctx); + + return 0; +} -- 2.18.0.huawei.25
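The core of __sched_group_set_soft_domain() above is the cluster-selection step: within the chosen LLC, subdomains are ordered so that clusters with fewer attached groups come first (lower utilization breaks ties), and clusters are then claimed in that order until the group's CPU count is covered. The following stand-alone sketch mirrors that ordering with qsort(); the struct fields and the example numbers are illustrative, not taken from the patch.

#include <stdio.h>
#include <stdlib.h>

struct node {
	int attached;          /* groups already attached to the cluster */
	unsigned long util;    /* current utilization of the cluster */
	int weight;            /* CPUs in the cluster */
};

/* Mirrors subdomain_cmp(): fewer attached groups first, then lower util. */
static int node_cmp(const void *a, const void *b)
{
	const struct node *ca = a, *cb = b;

	if (ca->attached < cb->attached ||
	    (ca->attached == cb->attached && ca->util < cb->util))
		return -1;
	return 1;
}

/* Claim clusters until nr_cpu CPUs are covered; returns how many were used. */
static int pick(struct node *nodes, int nr, int nr_cpu)
{
	int used = 0;

	qsort(nodes, nr, sizeof(*nodes), node_cmp);
	for (int i = 0; i < nr && nr_cpu > 0; i++, used++)
		nr_cpu -= nodes[i].weight;
	return used;
}

int main(void)
{
	struct node clusters[] = {
		{ .attached = 1, .util = 300, .weight = 4 },
		{ .attached = 0, .util = 500, .weight = 4 },
		{ .attached = 0, .util = 100, .weight = 4 },
	};

	printf("clusters used for a 6-CPU group: %d\n",
	       pick(clusters, 3, 6));   /* expect 2: the two unattached clusters */
	return 0;
}

In the patch the chosen cluster spans are OR'ed into the group's span (readable through cpu.soft_domain_cpu_list), and the feature is enabled per group by writing to cpu.soft_domain, with cpu.soft_domain_nr_cpu optionally overriding the CPU count otherwise derived from the CFS quota and period.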

hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IC8X6H -------------------------------- Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com> --- kernel/sched/fair.c | 66 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/features.h | 4 +++ 2 files changed, 70 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eb5a51218334..e94702c157c7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7419,6 +7419,40 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } } +#ifdef CONFIG_SCHED_SOFT_DOMAIN + if (sched_feat(SOFT_DOMAIN)) { + struct task_group *tg = task_group(p); + + if (tg->sf_ctx && tg->sf_ctx->policy != 0) { + struct cpumask *tmpmask = to_cpumask(tg->sf_ctx->span); + + for_each_cpu_wrap(cpu, tmpmask, target + 1) { + if (!cpumask_test_cpu(cpu, tmpmask)) + continue; + + if (smt) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + + if (idle_cpu != -1) + return idle_cpu; + + cpumask_andnot(cpus, cpus, tmpmask); + } + + } +#endif + if (static_branch_unlikely(&sched_cluster_active)) { struct sched_group *sg = sd->groups; @@ -8205,6 +8239,33 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, } #endif +#ifdef CONFIG_SCHED_SOFT_DOMAIN +static int wake_soft_domain(struct task_struct *p, int target) +{ + struct cpumask *mask = this_cpu_cpumask_var_ptr(select_idle_mask); + struct soft_domain_ctx *ctx = NULL; + + ctx = task_group(p)->sf_ctx; + if (!ctx || ctx->policy == 0) + goto out; + +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE + cpumask_and(mask, to_cpumask(ctx->span), p->select_cpus); +#else + cpumask_and(mask, to_cpumask(ctx->span), p->cpus_ptr); +#endif + cpumask_and(mask, mask, cpu_active_mask); + if (cpumask_empty(mask) || cpumask_test_cpu(target, mask)) + goto out; + else + target = cpumask_any_and_distribute(mask, mask); + +out: + + return target; +} +#endif + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -8267,6 +8328,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } rcu_read_lock(); + +#ifdef CONFIG_SCHED_SOFT_DOMAIN + if (sched_feat(SOFT_DOMAIN)) + new_cpu = prev_cpu = wake_soft_domain(p, prev_cpu); +#endif #ifdef CONFIG_BPF_SCHED if (bpf_sched_enabled()) { ctx.task = p; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1fd89af55681..34a5e3ce85e1 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -121,3 +121,7 @@ SCHED_FEAT(BASE_SLICE, true) */ SCHED_FEAT(DA_UTIL_TASKGROUP, true) #endif + +#ifdef CONFIG_SCHED_SOFT_DOMAIN +SCHED_FEAT(SOFT_DOMAIN, false) +#endif -- 2.18.0.huawei.25
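The wakeup-path change can be summarized as: search for an idle CPU inside the task group's soft-domain span first, and only fall back to the regular path when nothing idle is found there. Below is a minimal user-space sketch of that preference, assuming a 64-bit CPU mask and a GCC-style __builtin_ctzll() rather than the kernel's cpumask and select_idle_* machinery.

#include <stdio.h>

typedef unsigned long long mask_t;   /* assumption: <= 64 CPUs */

/*
 * Sketch of the wakeup policy: look for an idle CPU inside the group's
 * soft-domain span first, and only then fall back to the normal search.
 */
static int pick_cpu(mask_t allowed, mask_t span, mask_t idle, int fallback)
{
	mask_t preferred = allowed & span & idle;

	if (preferred)
		return __builtin_ctzll(preferred);   /* first idle CPU in the span */

	return fallback;                             /* regular select_idle_cpu() path */
}

int main(void)
{
	mask_t allowed = 0xffffULL;      /* CPUs 0-15 */
	mask_t span    = 0x00f0ULL;      /* soft domain: CPUs 4-7 */
	mask_t idle    = 0x0060ULL;      /* CPUs 5 and 6 are idle */

	printf("picked CPU %d\n", pick_cpu(allowed, span, idle, 0));  /* -> 5 */
	return 0;
}

Note that the SOFT_DOMAIN scheduler feature added in features.h defaults to false, so the new wakeup paths stay inert until the feature bit is turned on.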

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/IC8X6H

--------------------------------

NUMA migration is not yet implemented for soft-domain tasks, so do not
let load balancing move them outside their preferred span.

Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---
 kernel/sched/fair.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e94702c157c7..56f407770be0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9947,6 +9947,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 		return 0;
 
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+	/* Do not migrate soft domain tasks to outside of prefer cluster. */
+	if (sched_feat(SOFT_DOMAIN)) {
+		struct soft_domain_ctx *ctx = task_group(p)->sf_ctx;
+
+		if (ctx && ctx->policy &&
+		    !cpumask_test_cpu(env->dst_cpu, to_cpumask(ctx->span)))
+			return 0;
+	}
+#endif
+
 	/* Disregard pcpu kthreads; they are where they need to be. */
 	if (kthread_is_per_cpu(p))
 		return 0;
-- 
2.18.0.huawei.25
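The effect of the new can_migrate_task() hunk is a simple veto: while a group has a soft-domain policy set, the load balancer may not pull its tasks to a destination CPU outside the domain span. A tiny stand-alone sketch of that predicate, again assuming a 64-bit mask:

#include <stdio.h>
#include <stdbool.h>

typedef unsigned long long mask_t;   /* assumption: <= 64 CPUs */

/*
 * Sketch of the load-balance filter: a task whose group has a soft domain
 * (policy != 0) may only be pulled to CPUs inside that domain's span.
 */
static bool may_migrate(int policy, mask_t span, int dst_cpu)
{
	if (policy && !(span & (1ULL << dst_cpu)))
		return false;
	return true;
}

int main(void)
{
	mask_t span = 0x00f0ULL;                                    /* CPUs 4-7 */

	printf("to CPU 5: %d\n", may_migrate(1, span, 5));          /* 1: inside span */
	printf("to CPU 9: %d\n", may_migrate(1, span, 9));          /* 0: blocked */
	printf("no policy, CPU 9: %d\n", may_migrate(0, span, 9));  /* 1: unrestricted */
	return 0;
}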

hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICB7K1 -------------------------------- Addd a command-line "sched_soft_domain" switch for the soft domain feature; this switch is enabled by default. Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com> --- kernel/sched/core.c | 9 ++++++++ kernel/sched/soft_domain.c | 44 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e23a81913984..07101f6b10a4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9998,6 +9998,9 @@ static s64 cpu_soft_domain_read_s64(struct cgroup_subsys_state *css, { struct task_group *tg = css_tg(css); + if (!tg->sf_ctx) + return 0; + return (s64)tg->sf_ctx->policy; } @@ -10017,6 +10020,9 @@ static u64 cpu_soft_domain_quota_read_u64(struct cgroup_subsys_state *css, { struct task_group *tg = css_tg(css); + if (!tg->sf_ctx) + return 0; + return (u64)tg->sf_ctx->nr_cpus; } @@ -10024,6 +10030,9 @@ static int soft_domain_cpu_list_seq_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); + if (!tg->sf_ctx) + return 0; + seq_printf(sf, "%*pbl\n", cpumask_pr_args(to_cpumask(tg->sf_ctx->span))); return 0; diff --git a/kernel/sched/soft_domain.c b/kernel/sched/soft_domain.c index f20ff3b54fdc..b8bbfef42b23 100644 --- a/kernel/sched/soft_domain.c +++ b/kernel/sched/soft_domain.c @@ -18,6 +18,30 @@ #include "sched.h" #include <linux/sort.h> +static DEFINE_STATIC_KEY_TRUE(__soft_domain_switch); + +static int __init soft_domain_switch_setup(char *str) +{ + int val = 0; + + if (kstrtoint(str, 0, &val)) + pr_warn("sched_soft_domain parameter is error: %s\n", str); + else { + if (val == 1) + static_branch_enable(&__soft_domain_switch); + else if (val == 0) + static_branch_disable(&__soft_domain_switch); + } + + return 1; +} +__setup("sched_soft_domain=", soft_domain_switch_setup); + +static bool soft_domain_enabled(void) +{ + return static_branch_likely(&__soft_domain_switch); +} + static DEFINE_PER_CPU(struct soft_domain *, g_sf_d); static void free_sub_soft_domain(struct soft_domain *sf_d); @@ -87,6 +111,8 @@ static void free_soft_domain(void) if (sf_d) free_sub_soft_domain(sf_d); } + + static_branch_disable(&__soft_domain_switch); } void build_soft_domain(void) @@ -95,6 +121,9 @@ void build_soft_domain(void) static struct cpumask cpus; int i, ret; + if (!soft_domain_enabled()) + return; + cpumask_copy(&cpus, cpu_active_mask); rcu_read_lock(); for_each_cpu(i, &cpus) { @@ -358,6 +387,9 @@ int sched_group_set_soft_domain(struct task_group *tg, long val) { int ret = 0; + if (!soft_domain_enabled()) + return -EPERM; + if (val < -1 || val > nr_node_ids) return -EINVAL; @@ -385,6 +417,9 @@ int sched_group_set_soft_domain_quota(struct task_group *tg, long val) { int ret = 0; + if (!soft_domain_enabled()) + return -EPERM; + mutex_lock(&soft_domain_mutex); if (tg->sf_ctx->policy != 0) { ret = -EINVAL; @@ -403,6 +438,9 @@ int init_soft_domain(struct task_group *tg, struct task_group *parent) struct soft_domain_ctx *sf_ctx = NULL; struct soft_domain_ctx *psf_ctx = NULL; + if (!soft_domain_enabled()) + return 0; + sf_ctx = kzalloc(sizeof(*sf_ctx) + cpumask_size(), GFP_KERNEL); if (!sf_ctx) return -ENOMEM; @@ -426,6 +464,9 @@ void offline_soft_domain(struct task_group *tg) struct soft_domain_ctx *sf_ctx = NULL; struct soft_domain_ctx *psf_ctx = NULL; + if (!soft_domain_enabled()) + return; + sf_ctx = tg->sf_ctx; psf_ctx = tg->parent->sf_ctx; @@ -446,6 +487,9 @@ void offline_soft_domain(struct 
task_group *tg) int destroy_soft_domain(struct task_group *tg) { + if (!soft_domain_enabled()) + return 0; + kfree(tg->sf_ctx); return 0; -- 2.18.0.huawei.25
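The boot parameter added above is parsed once at __setup() time: the switch defaults to on, sched_soft_domain=0 disables it, and a malformed value only produces a warning. Here is a user-space sketch of the same parsing logic, using strtol() in place of the kernel's kstrtoint() and a plain bool in place of the static key.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* Sketch of the boot-parameter handling: default on, "0" turns it off. */
static bool soft_domain_on = true;            /* models the static key */

static void parse_sched_soft_domain(const char *str)
{
	char *end;
	long val = strtol(str, &end, 0);

	if (end == str || *end != '\0')
		fprintf(stderr, "sched_soft_domain parameter is error: %s\n", str);
	else if (val == 1)
		soft_domain_on = true;
	else if (val == 0)
		soft_domain_on = false;
}

int main(void)
{
	parse_sched_soft_domain("0");
	printf("after sched_soft_domain=0: %d\n", soft_domain_on);  /* 0 */
	parse_sched_soft_domain("1");
	printf("after sched_soft_domain=1: %d\n", soft_domain_on);  /* 1 */
	return 0;
}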

hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IC8X6H -------------------------------- Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com> --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index be1faf2da008..7f377064ce0c 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -167,6 +167,7 @@ CONFIG_RT_GROUP_SCHED=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y CONFIG_SCHED_TASK_RELATIONSHIP=y CONFIG_QOS_SCHED_NUMA_ICON=y +CONFIG_SCHED_SOFT_DOMAIN=y CONFIG_QOS_SCHED_SMART_GRID=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 3582dbee5d72..db3e2d29b94e 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -172,6 +172,7 @@ CONFIG_RT_GROUP_SCHED=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y # CONFIG_SCHED_TASK_RELATIONSHIP is not set # CONFIG_QOS_SCHED_NUMA_ICON is not set +# CONFIG_SCHED_SOFT_DOMAIN is not set # CONFIG_QOS_SCHED_SMART_GRID is not set CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y -- 2.18.0.huawei.25

hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IC8X6H -------------------------------- Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com> --- kernel/sched/fair.c | 65 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56f407770be0..5a2fb1734dc2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7109,6 +7109,55 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); +#ifdef CONFIG_SCHED_SOFT_DOMAIN +static inline bool sched_group_sf_preferred(struct task_struct *p, struct sched_group *group) +{ + struct soft_domain_ctx *ctx = NULL; + + if (!sched_feat(SOFT_DOMAIN)) + return true; + + ctx = task_group(p)->sf_ctx; + if (!ctx || ctx->policy == 0) + return true; + + if (!cpumask_intersects(sched_group_span(group), to_cpumask(ctx->span))) + return false; + + return true; +} + +static inline bool cpu_is_sf_preferred(struct task_struct *p, int cpu) +{ + struct soft_domain_ctx *ctx = NULL; + + if (!sched_feat(SOFT_DOMAIN)) + return true; + + ctx = task_group(p)->sf_ctx; + if (!ctx || ctx->policy == 0) + return true; + + if (!cpumask_test_cpu(cpu, to_cpumask(ctx->span))) + return false; + + return true; +} +#else + +static inline bool sched_group_sf_preferred(struct task_struct *p, struct sched_group *group) +{ + return true; +} + +static inline bool cpu_is_sf_preferred(struct task_struct *p, int cpu) +{ + return true; +} + +#endif + + /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. */ @@ -7137,6 +7186,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this if (!sched_core_cookie_match(rq, p)) continue; + if (!cpu_is_sf_preferred(p, i)) + continue; + if (sched_idle_cpu(i)) return i; @@ -8240,7 +8292,7 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, #endif #ifdef CONFIG_SCHED_SOFT_DOMAIN -static int wake_soft_domain(struct task_struct *p, int target) +static int wake_soft_domain(struct task_struct *p, int target, int *cpu, int sd_flags) { struct cpumask *mask = this_cpu_cpumask_var_ptr(select_idle_mask); struct soft_domain_ctx *ctx = NULL; @@ -8256,10 +8308,13 @@ static int wake_soft_domain(struct task_struct *p, int target) #endif cpumask_and(mask, mask, cpu_active_mask); if (cpumask_empty(mask) || cpumask_test_cpu(target, mask)) - goto out; + goto prefer; else target = cpumask_any_and_distribute(mask, mask); +prefer: + if (sd_flags & SD_BALANCE_FORK) + *cpu = target; out: return target; @@ -8331,7 +8386,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f #ifdef CONFIG_SCHED_SOFT_DOMAIN if (sched_feat(SOFT_DOMAIN)) - new_cpu = prev_cpu = wake_soft_domain(p, prev_cpu); + new_cpu = prev_cpu = wake_soft_domain(p, prev_cpu, &cpu, sd_flag); #endif #ifdef CONFIG_BPF_SCHED if (bpf_sched_enabled()) { @@ -11321,6 +11376,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) continue; + /* Skip over this group if not in soft domain */ + if (!sched_group_sf_preferred(p, group)) + continue; + local_group = cpumask_test_cpu(this_cpu, sched_group_span(group)); -- 2.18.0.huawei.25
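For newly forked tasks, the slow path (find_idlest_group()/find_idlest_group_cpu()) now skips candidate groups and CPUs that do not intersect the task group's soft-domain span. The following stand-alone sketch shows the filtering idea on a small array of groups; the group spans and load numbers are made up for illustration and a 64-bit mask stands in for cpumask.

#include <stdio.h>

typedef unsigned long long mask_t;   /* assumption: <= 64 CPUs */

struct group { mask_t span; unsigned long load; };

/*
 * Sketch of the fork-time filter: among candidate groups, ignore any group
 * whose CPUs do not intersect the soft-domain span, then pick the least
 * loaded of the remaining ones.
 */
static int pick_group(const struct group *g, int nr, mask_t sf_span)
{
	int best = -1;

	for (int i = 0; i < nr; i++) {
		if (!(g[i].span & sf_span))
			continue;                /* not a preferred group */
		if (best < 0 || g[i].load < g[best].load)
			best = i;
	}
	return best;
}

int main(void)
{
	struct group groups[] = {
		{ .span = 0x000fULL, .load = 100 },  /* CPUs 0-3  */
		{ .span = 0x00f0ULL, .load = 400 },  /* CPUs 4-7  */
		{ .span = 0x0f00ULL, .load =  50 },  /* CPUs 8-11 */
	};
	mask_t sf_span = 0x00ffULL;                  /* soft domain: CPUs 0-7 */

	printf("chosen group: %d\n", pick_group(groups, 3, sf_span));  /* -> 0 */
	return 0;
}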
participants (2):
- patchwork bot
- Zhang Qiao