
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/IC8X6H

--------------------------------

In the previous patch, we introduced the soft domain. Now we attach task
groups to soft domains, so that tasks are preferentially scheduled into
their associated soft scheduling domains during low-load periods.

To enable the soft domain scheduling feature for a task group, write '1'
to the cpu.soft_domain file of the CPU cgroup subsystem. This operation
allocates sub-soft_domains matching the CPU quota of the cgroup (if
cpu.cfs_quota_us is -1, it is treated as 1) to the task group,
subsequently establishing a preferred scheduling domain dedicated to this
group.

Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---
 kernel/sched/core.c        |  71 ++++++++++
 kernel/sched/fair.c        |  20 +++
 kernel/sched/sched.h       |  22 ++++
 kernel/sched/soft_domain.c | 260 +++++++++++++++++++++++++++++++++++++
 4 files changed, 373 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b3092685f267..961a5f95b128 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11725,6 +11725,58 @@ static inline s64 cpu_tag_read(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+
+static int cpu_soft_domain_write_s64(struct cgroup_subsys_state *css,
+				     struct cftype *cftype,
+				     s64 val)
+{
+	return sched_group_set_soft_domain(css_tg(css), val);
+}
+
+static s64 cpu_soft_domain_read_s64(struct cgroup_subsys_state *css,
+				    struct cftype *cftype)
+{
+	struct task_group *tg = css_tg(css);
+
+	return (s64)tg->sf_ctx->policy;
+}
+
+static int cpu_soft_domain_quota_write_u64(struct cgroup_subsys_state *css,
+					   struct cftype *cftype, u64 val)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (tg->sf_ctx->policy != 0)
+		return -EINVAL;
+
+	if (val > cpumask_weight(cpumask_of_node(0)))
+		return -EINVAL;
+
+	tg->sf_ctx->nr_cpus = (int)val;
+
+	return 0;
+}
+
+static u64 cpu_soft_domain_quota_read_u64(struct cgroup_subsys_state *css,
+					  struct cftype *cftype)
+{
+	struct task_group *tg = css_tg(css);
+
+	return (u64)tg->sf_ctx->nr_cpus;
+}
+
+static int soft_domain_cpu_list_seq_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	seq_printf(sf, "%*pbl\n", cpumask_pr_args(to_cpumask(tg->sf_ctx->span)));
+
+	return 0;
+}
+
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -11763,6 +11815,25 @@ static struct cftype cpu_legacy_files[] = {
 		.write_u64 = cpu_rebuild_affinity_domain_u64,
 	},
 #endif
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+	{
+		.name = "soft_domain",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_soft_domain_read_s64,
+		.write_s64 = cpu_soft_domain_write_s64,
+	},
+	{
+		.name = "soft_domain_nr_cpu",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_soft_domain_quota_read_u64,
+		.write_u64 = cpu_soft_domain_quota_write_u64,
+	},
+	{
+		.name = "soft_domain_cpu_list",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = soft_domain_cpu_list_seq_show,
+	},
+#endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.name = "cfs_quota_us",
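
The three cgroup files added above are the whole user-visible interface. Below
is a minimal usage sketch (an editor's illustration, not part of the patch): it
enables the soft domain for an existing group and reads back the CPU span the
group was given. It assumes the cgroup v1 cpu controller is mounted at
/sys/fs/cgroup/cpu and that a group named "mygrp" already exists; the mount
point and group name are only placeholders. Judging from
sched_group_set_soft_domain() and find_idlest_llc() below, writing -1 lets the
kernel pick the idlest LLC, a positive value N appears to select the LLC of
NUMA node N-1, and writing 0 detaches the group again.

/* Editor's sketch, not part of the patch; paths and group name are examples. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	ssize_t n;
	int fd;

	/* '1' enables the feature as described in the commit message. */
	fd = open("/sys/fs/cgroup/cpu/mygrp/cpu.soft_domain", O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror("enable soft domain");
		return 1;
	}
	close(fd);

	/* Read back the CPUs the group is now preferentially placed on. */
	fd = open("/sys/fs/cgroup/cpu/mygrp/cpu.soft_domain_cpu_list", O_RDONLY);
	if (fd < 0) {
		perror("read cpu list");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("soft domain cpus: %s", buf);
	}
	close(fd);

	return 0;
}
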
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7df2b07629c0..775447f31a5b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -14730,6 +14730,22 @@ void free_fair_sched_group(struct task_group *tg)
 	kfree(tg->se);
 }
 
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+int init_soft_domain(struct task_group *tg)
+{
+	struct soft_domain_ctx *sf_ctx = NULL;
+
+	sf_ctx = kzalloc(sizeof(*sf_ctx) + cpumask_size(), GFP_KERNEL);
+	if (!sf_ctx)
+		return -ENOMEM;
+
+	sf_ctx->policy = 0;
+	tg->sf_ctx = sf_ctx;
+
+	return 0;
+}
+#endif
+
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct sched_entity *se;
@@ -14750,6 +14766,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	if (ret)
 		goto err;
 
+	ret = init_soft_domain(tg);
+	if (ret)
+		goto err;
+
 	for_each_possible_cpu(i) {
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 056a680ae9ed..0dc1fccde30b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -404,6 +404,16 @@ struct auto_affinity {
 };
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+
+struct soft_domain_ctx {
+	int policy;
+	int nr_cpus;
+	struct soft_domain *sf_d;
+	unsigned long span[];
+};
+#endif
+
 /* Task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -469,7 +479,11 @@ struct task_group {
 	struct auto_affinity *auto_affinity;
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_DOMAIN
+	KABI_USE(1, struct soft_domain_ctx *sf_ctx)
+#else
 	KABI_RESERVE(1)
+#endif
 	KABI_RESERVE(2)
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
@@ -3736,6 +3750,10 @@ bool bpf_sched_is_cpu_allowed(struct task_struct *p, int cpu);
 
 #ifdef CONFIG_SCHED_SOFT_DOMAIN
 void build_soft_domain(void);
+int init_soft_domain(struct task_group *tg);
+
+int sched_group_set_soft_domain(struct task_group *tg, long val);
+
 static inline struct cpumask *soft_domain_span(unsigned long span[])
 {
 	return to_cpumask(span);
@@ -3743,6 +3761,10 @@ static inline struct cpumask *soft_domain_span(unsigned long span[])
 
 #else
 static inline void build_soft_domain(void) { }
+static inline int init_soft_domain(struct task_group *tg)
+{
+	return 0;
+}
 #endif
 
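
Before reading the soft_domain.c changes, the sizing rule is worth spelling
out. The sketch below is an editor's userspace mirror of __calc_cpu() further
down (not part of the patch): an explicit cpu.soft_domain_nr_cpu value wins,
otherwise the group gets quota/period CPUs from CFS bandwidth, and an
unlimited quota (cpu.cfs_quota_us == -1) falls back to one CPU. The values in
main() are invented.

#include <stdio.h>

#define RUNTIME_INF	(~0ULL)	/* stands in for the kernel's "no quota" marker */

/* Mirror of __calc_cpu(); quota and period are in the same time unit. */
static int calc_cpu(int nr_cpus, unsigned long long quota, unsigned long long period)
{
	int nr_cpu = 1;

	if (nr_cpus)
		nr_cpu = nr_cpus;		/* explicit cpu.soft_domain_nr_cpu */
	else if (quota != RUNTIME_INF)
		nr_cpu = quota / period > 1 ? quota / period : 1;

	return nr_cpu;
}

int main(void)
{
	printf("%d\n", calc_cpu(0, 400000ULL, 100000ULL));	/* quota of 4 CPUs -> 4 */
	printf("%d\n", calc_cpu(0, RUNTIME_INF, 100000ULL));	/* no quota       -> 1 */
	printf("%d\n", calc_cpu(6, RUNTIME_INF, 100000ULL));	/* explicit value -> 6 */

	return 0;
}
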
diff --git a/kernel/sched/soft_domain.c b/kernel/sched/soft_domain.c
index 1be52b056cad..11c2ae1fab95 100644
--- a/kernel/sched/soft_domain.c
+++ b/kernel/sched/soft_domain.c
@@ -15,6 +15,7 @@
  *
  *
  */
+#include <linux/sort.h>
 
 static DEFINE_PER_CPU(struct soft_domain *, g_sf_d);
 
@@ -111,3 +112,262 @@ void build_soft_domain(void)
 out:
 	rcu_read_unlock();
 }
+
+static DEFINE_MUTEX(soft_domain_mutex);
+
+#define NR_MAX_CLUSTER 16
+
+struct domain_node {
+	struct soft_subdomain	*sud_d;
+	unsigned int		attached;
+	unsigned long		util;
+};
+
+static int subdomain_cmp(const void *a, const void *b)
+{
+	struct domain_node *ca = (struct domain_node *)a;
+	struct domain_node *cb = (struct domain_node *)b;
+
+	if (ca->attached < cb->attached ||
+	    (ca->attached == cb->attached && ca->util < cb->util))
+		return -1;
+
+	return 1;
+}
+
+struct soft_domain_args {
+	int policy;
+	struct cpumask *cpus;
+};
+
+static int tg_set_soft_domain(struct task_group *tg, void *data)
+{
+	struct soft_domain_args *args = (struct soft_domain_args *)data;
+
+	tg->sf_ctx->policy = args->policy;
+	if (args->policy)
+		cpumask_copy(to_cpumask(tg->sf_ctx->span), args->cpus);
+	else
+		cpumask_clear(to_cpumask(tg->sf_ctx->span));
+
+	return 0;
+}
+
+static int __calc_cpu(struct task_group *tg)
+{
+	int nr_cpu = 1;
+
+	if (tg->sf_ctx->nr_cpus)
+		nr_cpu = tg->sf_ctx->nr_cpus;
+#ifdef CONFIG_CFS_BANDWIDTH
+	else if (tg->cfs_bandwidth.quota != RUNTIME_INF)
+		nr_cpu = max(1, tg->cfs_bandwidth.quota / tg->cfs_bandwidth.period);
+#endif
+
+	tg->sf_ctx->nr_cpus = nr_cpu;
+
+	return nr_cpu;
+}
+
+static unsigned long sum_util(struct cpumask *mask)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	for_each_cpu(cpu, mask)
+		sum += cpu_util_cfs(cpu);
+
+	return sum;
+}
+
+static int __check_policy(struct task_group *tg, void *data)
+{
+	return !!tg->sf_ctx->policy;
+}
+
+static int check_policy(struct task_group *tg, long policy)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = walk_tg_tree_from(tg, __check_policy, tg_nop, NULL);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static struct soft_domain *find_idlest_llc(long policy,
+					   int nr_cpu, cpumask_var_t cpus)
+{
+	int cpu;
+	int max_cpu = 0;
+	unsigned long min_util = ULONG_MAX;
+	struct soft_domain *idlest = NULL;
+
+	/* The user has specified the llc. */
+	if (policy > 0) {
+		cpu = cpumask_first(cpumask_of_node(policy - 1));
+		idlest = rcu_dereference(per_cpu(g_sf_d, cpu));
+		return idlest;
+	}
+
+	cpumask_copy(cpus, cpu_active_mask);
+	for_each_cpu(cpu, cpus) {
+		struct soft_domain *sf_d = NULL;
+
+		sf_d = rcu_dereference(per_cpu(g_sf_d, cpu));
+		if (sf_d == NULL)
+			continue;
+
+		/*
+		 * LLC selection order:
+		 * 1. If an llc has enough idle cpus, the one with more
+		 *    idle cpus is better;
+		 * 2. If no llc has enough idle cpus, the one with lower
+		 *    util is better.
+		 */
+		if (sf_d->nr_available_cpus > max_cpu &&
+		    nr_cpu <= sf_d->nr_available_cpus) {
+			max_cpu = sf_d->nr_available_cpus;
+			idlest = sf_d;
+		} else if (max_cpu == 0) { /* No llc meets the demand. */
+			unsigned long util = sum_util(to_cpumask(sf_d->span));
+
+			if (idlest == NULL || util < min_util) {
+				idlest = sf_d;
+				min_util = util;
+			}
+		}
+
+		cpumask_andnot(cpus, cpus, to_cpumask(sf_d->span));
+	}
+
+	return idlest;
+}
+
+static int __sched_group_set_soft_domain(struct task_group *tg, long policy)
+{
+	int cpu;
+	int ret = 0;
+	cpumask_var_t cpus;
+	int nr_cpu = __calc_cpu(tg);
+	struct soft_domain_args args;
+	struct domain_node nodes[NR_MAX_CLUSTER] = {0};
+
+	if (check_policy(tg, policy))
+		return -EINVAL;
+
+	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
+		return -ENOMEM;
+
+	scoped_guard (cpus_read_lock) {
+		struct soft_domain *sf_d = NULL;
+
+		rcu_read_lock();
+		/* 1. Find the idlest llc. */
+		sf_d = find_idlest_llc(policy, nr_cpu, cpus);
+		if (sf_d != NULL) {
+			/* 2. Select the idlest clusters. */
+			struct list_head *children = &sf_d->child_domain;
+			struct soft_subdomain *sub_d = NULL;
+			int nr = 0, i;
+			struct cpumask *tmpmask = NULL;
+			int tmp_cpu = nr_cpu;
+
+			list_for_each_entry(sub_d, children, node) {
+				nodes[nr].sud_d = sub_d;
+				nodes[nr].attached = sub_d->attached;
+				tmpmask = to_cpumask(sub_d->span);
+				cpu = cpumask_first(tmpmask);
+				nodes[nr].util = sum_util(tmpmask);
+				nr++;
+			}
+
+			cpumask_clear(cpus);
+
+			sort(nodes, nr, sizeof(struct domain_node), subdomain_cmp, NULL);
+			sf_d->nr_available_cpus -= min(sf_d->nr_available_cpus, tmp_cpu);
+			for (i = 0; i < nr; i++) {
+				sub_d = nodes[i].sud_d;
+				tmpmask = to_cpumask(sub_d->span);
+				cpumask_or(cpus, cpus, tmpmask);
+				sub_d->attached++;
+				nr_cpu -= cpumask_weight(tmpmask);
+				if (nr_cpu <= 0)
+					break;
+			}
+
+			/* 3. Attach the task group to the soft domain. */
+			args.policy = policy;
+			args.cpus = cpus;
+			walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args);
+
+			/*
+			 * 4. TODO:
+			 * Add tg to the llc domain task_groups list for load balance.
+			 */
+			tg->sf_ctx->sf_d = sf_d;
+		} else {
+			ret = -EINVAL;
+		}
+		rcu_read_unlock();
+	}
+
+	free_cpumask_var(cpus);
+
+	return ret;
+}
+
+static int __sched_group_unset_soft_domain(struct task_group *tg)
+{
+	struct soft_domain_args args = {
+		.policy = 0,
+	};
+	struct soft_domain *sf_d = NULL;
+	struct soft_subdomain *sub_d = NULL;
+	struct list_head *children = NULL;
+
+	/*
+	 * If the parent has set a soft domain, the child group can't
+	 * unset itself.
+	 */
+	if (tg->parent->sf_ctx->policy != 0)
+		return -EINVAL;
+
+	sf_d = tg->sf_ctx->sf_d;
+	sf_d->nr_available_cpus += __calc_cpu(tg);
+	children = &sf_d->child_domain;
+
+	list_for_each_entry(sub_d, children, node) {
+		if (cpumask_intersects(to_cpumask(tg->sf_ctx->span), to_cpumask(sub_d->span)))
+			sub_d->attached--;
+	}
+
+	walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args);
+
+	return 0;
+}
+
+int sched_group_set_soft_domain(struct task_group *tg, long val)
+{
+	int ret = 0;
+
+	if (val < -1 || val > nr_node_ids)
+		return -EINVAL;
+
+	mutex_lock(&soft_domain_mutex);
+
+	/* If enable or disable is repeated, return directly. */
+	if (!!tg->sf_ctx->policy == !!val)
+		goto out;
+
+	if (val == 0)
+		ret = __sched_group_unset_soft_domain(tg);
+	else
+		ret = __sched_group_set_soft_domain(tg, val);
+
+	if (!ret)
+		tg->sf_ctx->policy = val;
+
+out:
+	mutex_unlock(&soft_domain_mutex);
+
+	return ret;
+}
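
The cluster ordering used in step 2 above is easy to check with a plain
userspace mock (an editor's illustration, not part of the patch): clusters
with fewer already-attached groups sort first, and ties are broken by lower
summed utilization. The struct and the sample values are invented; the
comparator body mirrors subdomain_cmp() above.

#include <stdio.h>
#include <stdlib.h>

struct node {
	const char *name;	/* example label, not in the patch */
	unsigned int attached;	/* task groups already placed on this cluster */
	unsigned long util;	/* summed cfs utilization of its CPUs */
};

/* Same comparison as subdomain_cmp(), applied with qsort() instead of sort(). */
static int node_cmp(const void *a, const void *b)
{
	const struct node *ca = a, *cb = b;

	if (ca->attached < cb->attached ||
	    (ca->attached == cb->attached && ca->util < cb->util))
		return -1;

	return 1;
}

int main(void)
{
	struct node nodes[] = {
		{ "cluster0", 1, 120 },
		{ "cluster1", 0, 300 },
		{ "cluster2", 0,  80 },
	};
	int i;

	qsort(nodes, 3, sizeof(nodes[0]), node_cmp);

	/* Prints cluster2, cluster1, cluster0. */
	for (i = 0; i < 3; i++)
		printf("%s\n", nodes[i].name);

	return 0;
}
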
-- 
2.18.0.huawei.25
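
Addendum (editor's illustration, not part of the patch): the LLC choice in
step 1, find_idlest_llc(), can be summarised by the mock below. An LLC that
still has enough available CPUs is preferred, and among those the one with the
most available CPUs wins; only when no LLC can fit the request does the search
fall back to the LLC with the lowest summed utilization. All names and numbers
are invented.

#include <limits.h>
#include <stdio.h>

struct llc {
	const char *name;	/* example label */
	int nr_available_cpus;	/* CPUs not yet handed to a soft domain */
	unsigned long util;	/* summed cfs utilization of the LLC */
};

/* Same selection rule as find_idlest_llc(), without the cpumask plumbing. */
static const struct llc *pick_llc(const struct llc *llcs, int n, int nr_cpu)
{
	const struct llc *idlest = NULL;
	unsigned long min_util = ULONG_MAX;
	int max_cpu = 0;
	int i;

	for (i = 0; i < n; i++) {
		const struct llc *l = &llcs[i];

		if (l->nr_available_cpus > max_cpu && nr_cpu <= l->nr_available_cpus) {
			max_cpu = l->nr_available_cpus;
			idlest = l;
		} else if (max_cpu == 0) {
			/* Nothing fits so far: remember the least utilized LLC. */
			if (idlest == NULL || l->util < min_util) {
				idlest = l;
				min_util = l->util;
			}
		}
	}

	return idlest;
}

int main(void)
{
	const struct llc llcs[] = {
		{ "llc0", 2, 900 },
		{ "llc1", 6, 400 },
		{ "llc2", 4, 100 },
	};

	printf("%s\n", pick_llc(llcs, 3, 4)->name);	/* llc1: most available CPUs that fit */
	printf("%s\n", pick_llc(llcs, 3, 8)->name);	/* llc2: nothing fits, lowest util */

	return 0;
}
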