Backport smart grid from OLK-5.10 to openEuler-1.0-LTS
Yipeng Zou (6):
  sched: introduce smart grid qos zone
  smart_grid: introduce /proc/pid/smart_grid_level
  smart_grid: introduce smart_grid_strategy_ctrl sysctl
  smart_grid: cpufreq: introduce smart_grid cpufreq control
  smart_grid: introducing rebuild_affinity_domain
  sched: smart_grid: silence compiler error
 drivers/cpufreq/cpufreq.c      | 236 +++++++++++++++++++++++++++++++++
 fs/proc/base.c                 |  78 +++++++++++
 include/linux/cpufreq.h        |  12 ++
 include/linux/sched/grid_qos.h |  33 ++++-
 include/linux/sched/sysctl.h   |   1 +
 kernel/sched/core.c            |  26 ++++
 kernel/sched/fair.c            |  57 +++++++-
 kernel/sched/grid/qos.c        | 123 +++++++++++++++++
 kernel/sched/grid/stat.c       |  15 +++
 kernel/sched/sched.h           |   2 +
 kernel/sysctl.c                |   9 ++
 11 files changed, 586 insertions(+), 6 deletions(-)
FeedBack: The patch(es) you sent to the kernel@openeuler.org mailing list have been successfully converted to a pull request!
Pull request link: https://gitee.com/openeuler/kernel/pulls/11238
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/O...
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZBSR
CVE: NA
----------------------------------------
Commit b869720191ec ("sched: smart grid: init sched_grid_qos structure on QOS purpose") introduced a smart_grid-based QoS partitioning mechanism; this commit extends that mechanism to implement smart_grid zones.

In the default smart_grid configuration, the entire system is divided into two partitions:

1. Hot zone (performance first)
2. Warm zone (energy consumption priority)

In addition, smart_grid dynamically adjusts the size of the hot zone based on the task load in each partition, building on commit 65523f55989a ("sched: Introduce smart grid scheduling strategy for cfs").
 --------     --------     --------
| group0 |   | group1 |   | group2 |
 --------     --------     --------
     |            |            |
     v            v            v
 -------------------------     ---------------
|                         |   |               |
|         hot zone        |   |   warm zone   |
|                         |   |               |
 -------------------------     ---------------
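Once applied, the resulting zone split can be observed through the existing cpu.affinity_stat cgroup interface, which this patch extends with two "zone" lines. A quick sketch (the cgroup path and CPU ranges are illustrative):

  # cat /sys/fs/cgroup/cpu/<group>/cpu.affinity_stat
  ...
  curr_level 0
  zone hot 0-31
  zone warm 32-127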
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 include/linux/sched/grid_qos.h | 21 +++++++++
 kernel/sched/core.c            |  6 +++
 kernel/sched/fair.c            |  7 +++
 kernel/sched/grid/qos.c        | 83 ++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h           |  1 +
 5 files changed, 118 insertions(+)
diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h
index 93f663453e16..f41fc1d83e55 100644
--- a/include/linux/sched/grid_qos.h
+++ b/include/linux/sched/grid_qos.h
@@ -84,7 +84,28 @@ void sched_grid_qos_free(struct task_struct *p);
 int sched_grid_preferred_interleave_nid(struct mempolicy *policy);
 int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask);
+
+enum sg_zone_type {
+	SMART_GRID_ZONE_HOT = 0,
+	SMART_GRID_ZONE_WARM,
+	SMART_GRID_ZONE_NR
+};
+
+struct auto_affinity;
+struct sched_grid_zone {
+	raw_spinlock_t lock;
+	struct cpumask cpus[SMART_GRID_ZONE_NR];
+	struct list_head af_list_head; /* struct auto_affinity list head */
+};
+
+int __init sched_grid_zone_init(void);
+int sched_grid_zone_update(bool is_locked);
+int sched_grid_zone_add_af(struct auto_affinity *af);
+int sched_grid_zone_del_af(struct auto_affinity *af);
+struct cpumask *sched_grid_zone_cpumask(enum sg_zone_type zone);
 #else
+static inline int __init sched_grid_zone_init(void) { return 0; }
+
 static inline int
 sched_grid_preferred_interleave_nid(struct mempolicy *policy)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe9f91f39e2f..4e2f95d05896 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -18,6 +18,7 @@
 #include "../../fs/io-wq.h"
 #include "../smpboot.h"
+#include <linux/sched/grid_qos.h>
 #include "pelt.h"
 #define CREATE_TRACE_POINTS
@@ -5977,6 +5978,7 @@ void __init sched_init_smp(void)
sched_smp_initialized = true;
+	sched_grid_zone_init();
 	init_auto_affinity(&root_task_group);
 }
@@ -7096,6 +7098,10 @@ static int cpu_affinity_stat_show(struct seq_file *sf, void *v)
 	seq_printf(sf, "dcount %d\n", ad->dcount);
 	seq_printf(sf, "domain_mask 0x%x\n", ad->domain_mask);
 	seq_printf(sf, "curr_level %d\n", ad->curr_level);
+	seq_printf(sf, "zone hot %*pbl\n",
+		   cpumask_pr_args(sched_grid_zone_cpumask(SMART_GRID_ZONE_HOT)));
+	seq_printf(sf, "zone warm %*pbl\n",
+		   cpumask_pr_args(sched_grid_zone_cpumask(SMART_GRID_ZONE_WARM)));
 	for (i = 0; i < ad->dcount; i++)
 		seq_printf(sf, "sd_level %d, cpu list %*pbl, stay_cnt %llu\n",
 			   i, cpumask_pr_args(ad->domains[i]),
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 63f4344ac344..b0e0b17137ea 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5363,6 +5363,7 @@ static void affinity_domain_up(struct task_group *tg)
 		if (IS_DOMAIN_SET(level + 1, ad->domain_mask) &&
 		    cpumask_weight(ad->domains[level + 1]) > 0) {
 			ad->curr_level = level + 1;
+			sched_grid_zone_update(false);
 			return;
 		}
 		level++;
@@ -5383,6 +5384,7 @@ static void affinity_domain_down(struct task_group *tg)
 		if (IS_DOMAIN_SET(level - 1, ad->domain_mask)) {
 			ad->curr_level = level - 1;
+			sched_grid_zone_update(false);
 			return;
 		}
 		level--;
@@ -5454,6 +5456,7 @@ static int tg_update_affinity_domain_down(struct task_group *tg, void *data)
 	}
 	}
+	sched_grid_zone_update(false);
 	raw_spin_unlock_irqrestore(&auto_affi->lock, flags);
 	return 0;
@@ -5512,6 +5515,7 @@ void stop_auto_affinity(struct auto_affinity *auto_affi)
 	raw_spin_unlock_irq(&auto_affi->lock);
 	smart_grid_usage_dec();
+	sched_grid_zone_update(false);
 	mutex_unlock(&smart_grid_used_mutex);
 }
@@ -5713,6 +5717,8 @@ int init_auto_affinity(struct task_group *tg)
 	auto_affi->tg = tg;
 	tg->auto_affinity = auto_affi;
+	INIT_LIST_HEAD(&auto_affi->af_list);
+	sched_grid_zone_add_af(auto_affi);
 	return 0;
 }
@@ -5727,6 +5733,7 @@ static void destroy_auto_affinity(struct task_group *tg)
 		smart_grid_usage_dec();
 	hrtimer_cancel(&auto_affi->period_timer);
+	sched_grid_zone_del_af(auto_affi);
 	free_affinity_domains(&auto_affi->ad);
 	kfree(tg->auto_affinity);
diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c
index b3df69d91499..13e36d269f36 100644
--- a/kernel/sched/grid/qos.c
+++ b/kernel/sched/grid/qos.c
@@ -22,6 +22,7 @@
 #include <linux/sched.h>
 #include <linux/sched/grid_qos.h>
 #include "internal.h"
+#include <../kernel/sched/sched.h>
 static inline int qos_affinity_set(struct task_struct *p)
 {
@@ -148,3 +149,85 @@ int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask)
 	return nd;
 }
+
+static struct sched_grid_zone sg_zone;
+
+int __init sched_grid_zone_init(void)
+{
+	int index;
+
+	for (index = 0; index < SMART_GRID_ZONE_NR; index++)
+		cpumask_clear(&sg_zone.cpus[index]);
+
+	raw_spin_lock_init(&sg_zone.lock);
+	INIT_LIST_HEAD(&sg_zone.af_list_head);
+	return 0;
+}
+
+int sched_grid_zone_update(bool is_locked)
+{
+	struct list_head *pos;
+	struct auto_affinity *af_pos;
+	unsigned long flags;
+
+	if (!is_locked)
+		raw_spin_lock_irqsave(&sg_zone.lock, flags);
+
+	cpumask_clear(&sg_zone.cpus[SMART_GRID_ZONE_HOT]);
+
+	list_for_each(pos, &sg_zone.af_list_head) {
+		af_pos = list_entry(pos, struct auto_affinity, af_list);
+
+		/* when smart_grid not used we need calculate all task_group */
+		/* when smart_grid used we only calculate enabled task_group */
+		if (smart_grid_used() && af_pos->mode == 0)
+			continue;
+
+		cpumask_or(&sg_zone.cpus[SMART_GRID_ZONE_HOT], &sg_zone.cpus[SMART_GRID_ZONE_HOT],
+			   af_pos->ad.domains[af_pos->ad.curr_level]);
+	}
+
+	cpumask_complement(&sg_zone.cpus[SMART_GRID_ZONE_WARM],
+			   &sg_zone.cpus[SMART_GRID_ZONE_HOT]);
+
+	if (!is_locked)
+		raw_spin_unlock_irqrestore(&sg_zone.lock, flags);
+
+	return 0;
+}
+
+int sched_grid_zone_add_af(struct auto_affinity *af)
+{
+	unsigned long flags;
+
+	if (af == NULL)
+		return -1;
+
+	raw_spin_lock_irqsave(&sg_zone.lock, flags);
+	list_add_tail(&af->af_list, &sg_zone.af_list_head);
+	sched_grid_zone_update(true);
+	raw_spin_unlock_irqrestore(&sg_zone.lock, flags);
+	return 0;
+}
+
+int sched_grid_zone_del_af(struct auto_affinity *af)
+{
+	unsigned long flags;
+
+	if (af == NULL)
+		return -1;
+
+	raw_spin_lock_irqsave(&sg_zone.lock, flags);
+	list_del(&af->af_list);
+	sched_grid_zone_update(true);
+	raw_spin_unlock_irqrestore(&sg_zone.lock, flags);
+	return 0;
+}
+
+struct cpumask *sched_grid_zone_cpumask(enum sg_zone_type zone)
+{
+	if (zone >= SMART_GRID_ZONE_NR)
+		return NULL;
+
+	return &sg_zone.cpus[zone];
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 80e9d254ab7c..2dec32a61de0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -391,6 +391,7 @@ struct auto_affinity {
 	int period_active;
 	struct affinity_domain ad;
 	struct task_group *tg;
+	struct list_head af_list;
 #endif
 };
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZBSR
CVE: NA
----------------------------------------
Now, we can use /proc/pid/smart_grid_level to read or set a task's current QoS level.

This determines the scope of dynamic partitioning applied to the task in smart_grid.

SCHED_GRID_QOS_TASK_LEVEL defines the different QoS levels. A lower number means a higher priority (e.g. 0 is the highest).
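A brief usage sketch (the PID and level values are illustrative):

  # read the current QoS level of task 1234
  cat /proc/1234/smart_grid_level
  # move task 1234 to the highest level
  echo 0 > /proc/1234/smart_grid_level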
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 fs/proc/base.c                 | 78 ++++++++++++++++++++++++++++++++++
 include/linux/sched/grid_qos.h | 11 ++++-
 kernel/sched/grid/stat.c       | 15 +++++++
 3 files changed, 103 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dc9841826264..a76af2117c2c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -98,6 +98,10 @@
 #include <trace/events/oom.h>
 #include "internal.h"
 #include "fd.h"
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+#include <linux/sched/grid_qos.h>
+#include <linux/sched.h>
+#endif
#include "../../lib/kstrtox.h"
@@ -3042,6 +3046,77 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
 }
 #endif /* CONFIG_LIVEPATCH */
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+static int smart_grid_level_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	if (p->_resvd != NULL && p->_resvd->grid_qos != NULL)
+		seq_printf(m, "%d\n", p->_resvd->grid_qos->stat.class_lvl);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static int smart_grid_level_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, smart_grid_level_show, inode);
+}
+
+static ssize_t smart_grid_level_write(struct file *file, const char __user *buf,
+				      size_t count, loff_t *offset)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *p;
+	char buffer[TASK_COMM_LEN];
+	const size_t maxlen = sizeof(buffer) - 1;
+	unsigned int level = SCHED_GRID_QOS_TASK_LEVEL_MAX;
+	int ret = 0;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
+		return -EFAULT;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	if (kstrtouint(buffer, 10, &level)) {
+		put_task_struct(p);
+		return -EINVAL;
+	}
+
+	if (level >= SCHED_GRID_QOS_TASK_LEVEL_MAX) {
+		put_task_struct(p);
+		return -EINVAL;
+	}
+
+	if (p->_resvd != NULL && p->_resvd->grid_qos != NULL &&
+	    p->_resvd->grid_qos->stat.set_class_lvl != NULL)
+		ret = p->_resvd->grid_qos->stat.set_class_lvl(&p->_resvd->grid_qos->stat, level);
+
+	put_task_struct(p);
+
+	if (ret)
+		return ret;
+	return count;
+}
+
+static const struct file_operations proc_pid_sg_level_operations = {
+	.open		= smart_grid_level_open,
+	.read		= seq_read,
+	.write		= smart_grid_level_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
 /*
  * Thread groups
  */
@@ -3065,6 +3140,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	REG("smart_grid_level", 0644, proc_pid_sg_level_operations),
+#endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
 #endif
diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h
index f41fc1d83e55..3942735144a0 100644
--- a/include/linux/sched/grid_qos.h
+++ b/include/linux/sched/grid_qos.h
@@ -17,6 +17,15 @@ enum sched_grid_qos_class {
 	SCHED_GRID_QOS_CLASS_LEVEL_NR
 };
+/*
+ * SCHED_GRID_QOS_TASK_LEVEL defines the different QoS levels.
+ * A lower number means a higher priority (e.g. 0 is the highest).
+ * enum sched_grid_qos_class defines the max level, i.e. the lowest level.
+ */
+#define SCHED_GRID_QOS_TASK_LEVEL_HIGHEST SCHED_GRID_QOS_CLASS_LEVEL_1
+#define SCHED_GRID_QOS_TASK_LEVEL_MAX (SCHED_GRID_QOS_CLASS_LEVEL_NR)
+#define SCHED_GRID_QOS_TASK_LEVEL_DEFAULT (SCHED_GRID_QOS_CLASS_LEVEL_NR - 1)
+
 enum {
 	SCHED_GRID_QOS_IPS_INDEX = 0,
 	SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX = 1,
@@ -50,7 +59,7 @@ struct sched_grid_qos_sample {
 struct sched_grid_qos_stat {
 	enum sched_grid_qos_class class_lvl;
-	int (*set_class_lvl)(struct sched_grid_qos_stat *qos_stat);
+	int (*set_class_lvl)(struct sched_grid_qos_stat *qos_stat, int level);
 	struct sched_grid_qos_sample sample[SCHED_GRID_QOS_SAMPLE_NR];
 };
diff --git a/kernel/sched/grid/stat.c b/kernel/sched/grid/stat.c
index b40c75145608..68bbc060b811 100644
--- a/kernel/sched/grid/stat.c
+++ b/kernel/sched/grid/stat.c
@@ -19,8 +19,20 @@
 #include <linux/sched/grid_qos.h>
 #include "internal.h"
+static int qos_stat_set_class_level(struct sched_grid_qos_stat *qos_stat, int level)
+{
+	if (qos_stat == NULL || level >= SCHED_GRID_QOS_TASK_LEVEL_MAX)
+		return -EINVAL;
+
+	qos_stat->class_lvl = level;
+	return 0;
+}
+
 void qos_stat_init(struct sched_grid_qos_stat *stat)
 {
+	if (stat == NULL)
+		return;
+
 	stat->sample[SCHED_GRID_QOS_IPS_INDEX].name = "ips";
 	stat->sample[SCHED_GRID_QOS_IPS_INDEX].index = SCHED_GRID_QOS_IPS_INDEX;
 	stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].name = "membound_ratio";
@@ -29,4 +41,7 @@ void qos_stat_init(struct sched_grid_qos_stat *stat)
 	stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].name = "memband_width";
 	stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].index =
 		SCHED_GRID_QOS_MEMBANDWIDTH_INDEX;
+
+	stat->set_class_lvl = qos_stat_set_class_level;
+	stat->class_lvl = SCHED_GRID_QOS_TASK_LEVEL_DEFAULT;
 }
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZBSR
CVE: NA
----------------------------------------
For backward compatibility with the previous version of the code:

When the smart_grid strategy is disabled, every task is placed at the highest qos_level (grid_qos_level = 0).

Otherwise, when the smart_grid strategy is enabled, the task's actual grid_qos_level is used.

The smart_grid strategy is disabled (=0) by default.
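Since the control is registered in kern_table (see the kernel/sysctl.c hunk below), it can be toggled via sysctl. A usage sketch:

  # enable placement by per-task QoS level
  sysctl -w kernel.smart_grid_strategy_ctrl=1
  # restore the legacy behavior (default)
  sysctl -w kernel.smart_grid_strategy_ctrl=0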
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 include/linux/sched/grid_qos.h |  1 +
 include/linux/sched/sysctl.h   |  1 +
 kernel/sched/fair.c            |  5 +----
 kernel/sched/grid/qos.c        | 34 ++++++++++++++++++++++++++++++++++
 kernel/sysctl.c                |  9 +++++++++
 5 files changed, 46 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h
index 3942735144a0..fac28206d76b 100644
--- a/include/linux/sched/grid_qos.h
+++ b/include/linux/sched/grid_qos.h
@@ -112,6 +112,7 @@ int sched_grid_zone_update(bool is_locked);
 int sched_grid_zone_add_af(struct auto_affinity *af);
 int sched_grid_zone_del_af(struct auto_affinity *af);
 struct cpumask *sched_grid_zone_cpumask(enum sg_zone_type zone);
+struct cpumask *sched_grid_prefer_cpus(struct task_struct *p);
 #else
 static inline int __init sched_grid_zone_init(void) { return 0; }
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index ad472760e97d..caf4fd7cf68f 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -37,6 +37,7 @@ extern int sysctl_sched_util_low_pct;
 #endif
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
+extern unsigned int sysctl_smart_grid_strategy_ctrl;
 extern int sysctl_affinity_adjust_delay_ms;
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b0e0b17137ea..c5318a44444e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5327,8 +5327,6 @@ static void smart_grid_usage_dec(void)
 static inline struct cpumask *task_prefer_cpus(struct task_struct *p)
 {
-	struct affinity_domain *ad;
-
 	if (!smart_grid_used() || !task_group(p)->auto_affinity)
 		return p->prefer_cpus;
@@ -5336,8 +5334,7 @@ static inline struct cpumask *task_prefer_cpus(struct task_struct *p)
 	if (task_group(p)->auto_affinity->mode == 0)
 		return &p->cpus_allowed;
-	ad = &task_group(p)->auto_affinity->ad;
-	return ad->domains[ad->curr_level];
+	return sched_grid_prefer_cpus(p);
 }
 static inline int dynamic_affinity_mode(struct task_struct *p)
diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c
index 13e36d269f36..87ebd5003640 100644
--- a/kernel/sched/grid/qos.c
+++ b/kernel/sched/grid/qos.c
@@ -231,3 +231,37 @@ struct cpumask *sched_grid_zone_cpumask(enum sg_zone_type zone)
 	return &sg_zone.cpus[zone];
 }
+
+/*
+ * The smart_grid strategy is disabled (=0) by default.
+ * For backward compatibility with the previous version of the code,
+ * every task is treated as having the highest qos_level (class_lvl = 0)
+ * while the smart_grid strategy is disabled.
+ * Otherwise, when the smart_grid strategy is enabled, the task's
+ * actual class_lvl is used.
+ */
+unsigned int sysctl_smart_grid_strategy_ctrl;
+
+struct cpumask *sched_grid_prefer_cpus(struct task_struct *p)
+{
+	struct affinity_domain *ad;
+	enum sg_zone_type current_zone;
+
+	ad = &task_group(p)->auto_affinity->ad;
+	/*
+	 * When the smart_grid strategy is disabled, every task is
+	 * treated as having the highest qos_level (class_lvl = 0).
+	 */
+	if (sysctl_smart_grid_strategy_ctrl == 0)
+		return ad->domains[ad->curr_level];
+
+	/* Only place the highest level tasks into the hot zone */
+	current_zone = p->_resvd->grid_qos->stat.class_lvl ==
+		       SCHED_GRID_QOS_TASK_LEVEL_HIGHEST ?
+		       SMART_GRID_ZONE_HOT : SMART_GRID_ZONE_WARM;
+
+	/* Place the highest level tasks in the current domain level itself */
+	if (current_zone == SMART_GRID_ZONE_HOT)
+		return ad->domains[ad->curr_level];
+
+	return &sg_zone.cpus[current_zone];
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f64c4495bc54..1e6be2dfef45 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1343,6 +1343,15 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
+	{
+		.procname	= "smart_grid_strategy_ctrl",
+		.data		= &sysctl_smart_grid_strategy_ctrl,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{
 		.procname	= "affinity_adjust_delay_ms",
 		.data		= &sysctl_affinity_adjust_delay_ms,
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZBSR
CVE: NA
----------------------------------------
With smart_grid zones, the system is divided into multiple zones. The typical configuration is one hot zone plus one warm zone.

This commit allows an independent cpufreq governor to be configured for each zone. When a zone's CPU range changes dynamically, the corresponding cpufreq governor is applied automatically.

By default, the hot zone uses the performance governor and the warm zone uses the powersave governor.
 --------     --------     --------
| group0 |   | group1 |   | group2 |
 --------     --------     --------
     |            |            |
     v            v            v
 ---------------------------     -----------------
|                           |   |                 |
|          hot zone         |   |    warm zone    |
|                           |   |                 |
 ---------------------------     -----------------
              ^                          ^
              |                          |
       ---------------              -----------
      |  performance  |            | powersave |
       ---------------              -----------
           (cpufreq governor of each zone)
Introduce two attributes in /sys/devices/system/cpu/cpufreq:
1. smart_grid_governor_enable
Write 1 to enable the smart_grid governor, 0 to disable it; read it to get the current status.

Note: while the smart_grid governor is enabled, a CPU's governor may be switched automatically by smart_grid, so users who want to configure governors themselves should disable it first.
2. smart_grid_governor
This attribute can only be set while the smart_grid governor is enabled.

Write a string in {level}-{governor name} format to change a zone's governor:

0-performance will set all hot zone CPUs' governor to performance.
1-powersave will set all warm zone CPUs' governor to powersave.
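A usage sketch combining both attributes (which governors are available depends on the kernel configuration):

  cd /sys/devices/system/cpu/cpufreq
  # let smart_grid take over governor selection
  echo 1 > smart_grid_governor_enable
  # hot zone (level 0) -> performance, warm zone (level 1) -> powersave
  echo 0-performance > smart_grid_governor
  echo 1-powersave > smart_grid_governor
  cat smart_grid_governor
  smart_grid-0: performance
  smart_grid-1: powersave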
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 drivers/cpufreq/cpufreq.c | 236 ++++++++++++++++++++++++++++++++++++++
 include/linux/cpufreq.h   |  12 ++
 kernel/sched/grid/qos.c   |  10 +-
 3 files changed, 256 insertions(+), 2 deletions(-)
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 4bd9fc287c82..7656f45bb851 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2444,6 +2444,239 @@ int cpufreq_boost_enabled(void)
 }
 EXPORT_SYMBOL_GPL(cpufreq_boost_enabled);
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+
+struct smart_grid_zone {
+	char governor_name[SMART_GRID_ZONE_NR][CPUFREQ_NAME_LEN];
+	unsigned int enable;
+	struct irq_work irq_work;
+	struct work_struct work;
+	unsigned int is_init;
+};
+
+static struct smart_grid_zone sg_zone;
+static DEFINE_MUTEX(sg_zone_lock);
+
+#define SG_WRITE_BUFF_LEN 30
+
+void cpufreq_smart_grid_start_sync(void)
+{
+	if (likely(sg_zone.is_init))
+		irq_work_queue(&sg_zone.irq_work);
+}
+
+static ssize_t show_smart_grid_governor(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	int len = 0;
+	int gov_index;
+
+	mutex_lock(&sg_zone_lock);
+	if (!sg_zone.enable) {
+		mutex_unlock(&sg_zone_lock);
+		return sprintf(buf, "smart_grid governor disable\n");
+	}
+
+	for (gov_index = 0; gov_index < SMART_GRID_ZONE_NR; gov_index++)
+		len += sprintf(buf + len, "smart_grid-%d: %s\n", gov_index,
+			       sg_zone.governor_name[gov_index]);
+
+	mutex_unlock(&sg_zone_lock);
+	return len;
+}
+
+static ssize_t store_smart_grid_governor(struct kobject *kobj, struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	unsigned int current_level;
+	char *level_string = NULL;
+	char buf_string[SG_WRITE_BUFF_LEN];
+	char *gov_string = buf_string;
+	char save_string[CPUFREQ_NAME_LEN];
+	int ret;
+	struct cpufreq_policy new_policy;
+
+	mutex_lock(&sg_zone_lock);
+	if (!sg_zone.enable) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	if (strscpy(buf_string, buf, SG_WRITE_BUFF_LEN) <= 0) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	level_string = strsep(&gov_string, "-");
+	if (level_string == NULL) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	if (kstrtouint(level_string, 10, &current_level)) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	if (current_level >= SMART_GRID_ZONE_NR) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	if (sscanf(gov_string, "%15s", save_string) != 1) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	if (cpufreq_parse_governor(save_string, &new_policy)) {
+		ret = -EINVAL;
+		goto fail;
+	}
+	if (new_policy.governor)
+		module_put(new_policy.governor->owner);
+
+	strscpy(sg_zone.governor_name[current_level], save_string, CPUFREQ_NAME_LEN);
+	cpufreq_smart_grid_start_sync();
+	mutex_unlock(&sg_zone_lock);
+	return count;
+
+fail:
+	mutex_unlock(&sg_zone_lock);
+	return ret;
+}
+define_one_global_rw(smart_grid_governor);
+
+static ssize_t show_smart_grid_governor_enable(struct kobject *kobj,
+					       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", sg_zone.enable);
+}
+
+static void smart_grid_irq_work(struct irq_work *irq_work)
+{
+	struct smart_grid_zone *zone;
+
+	zone = container_of(irq_work, struct smart_grid_zone, irq_work);
+	schedule_work_on(smp_processor_id(), &zone->work);
+}
+
+static void smart_grid_work_handler(struct work_struct *work)
+{
+	struct smart_grid_zone *zone;
+	struct cpufreq_policy *policy = NULL;
+	unsigned int cpu;
+	int gov_index;
+	struct cpufreq_policy new_policy;
+
+	zone = container_of(work, struct smart_grid_zone, work);
+
+	mutex_lock(&sg_zone_lock);
+	if (!sg_zone.enable) {
+		mutex_unlock(&sg_zone_lock);
+		return;
+	}
+
+	/*
+	 * Because the policy may be shared between the hot and warm zone,
+	 * we need to make sure the hot zone has the highest priority.
+	 */
+	for (gov_index = SMART_GRID_ZONE_NR - 1; gov_index >= 0; gov_index--) {
+		if (cpufreq_parse_governor(sg_zone.governor_name[gov_index], &new_policy))
+			continue;
+
+		for_each_cpu(cpu, sched_grid_zone_cpumask(gov_index)) {
+			if (cpu_is_offline(cpu))
+				continue;
+
+			policy = cpufreq_cpu_get(cpu);
+			if (policy == NULL)
+				continue;
+
+			if (policy->governor == new_policy.governor) {
+				cpufreq_cpu_put(policy);
+				continue;
+			}
+			/* Try to switch governor */
+			store_scaling_governor(policy, sg_zone.governor_name[gov_index],
+					       CPUFREQ_NAME_LEN);
+			cpufreq_cpu_put(policy);
+		}
+		if (new_policy.governor)
+			module_put(new_policy.governor->owner);
+	}
+	mutex_unlock(&sg_zone_lock);
+}
+
+static void sg_zone_set_enable(void)
+{
+	int gov_index;
+
+	/* Set default smart_grid governor */
+	for (gov_index = 0; gov_index < SMART_GRID_ZONE_NR; gov_index++) {
+		if (!gov_index)
+			strscpy(sg_zone.governor_name[gov_index], "performance", CPUFREQ_NAME_LEN);
+		else
+			strscpy(sg_zone.governor_name[gov_index], "powersave", CPUFREQ_NAME_LEN);
+	}
+
+	sg_zone.enable = 1;
+	cpufreq_smart_grid_start_sync();
+}
+
+static void sg_zone_set_disable(void)
+{
+	sg_zone.enable = 0;
+}
+
+static ssize_t store_smart_grid_governor_enable(struct kobject *kobj, struct kobj_attribute *attr,
+						const char *buf, size_t count)
+{
+	unsigned int enable;
+
+	if (kstrtouint(buf, 10, &enable))
+		return -EINVAL;
+
+	if (enable > 1)
+		return -EINVAL;
+
+	mutex_lock(&sg_zone_lock);
+	if (sg_zone.enable == enable) {
+		mutex_unlock(&sg_zone_lock);
+		return -EINVAL;
+	}
+
+	if (enable)
+		sg_zone_set_enable();
+	else
+		sg_zone_set_disable();
+
+	mutex_unlock(&sg_zone_lock);
+	return count;
+}
+define_one_global_rw(smart_grid_governor_enable);
+
+static int create_smart_grid_sysfs_file(void)
+{
+	int ret;
+
+	ret = sysfs_create_file(cpufreq_global_kobject, &smart_grid_governor.attr);
+	if (ret)
+		pr_err("%s: cannot register global smart_grid_governor sysfs file\n",
+		       __func__);
+
+	ret = sysfs_create_file(cpufreq_global_kobject, &smart_grid_governor_enable.attr);
+	if (ret)
+		pr_err("%s: cannot register global smart_grid_governor_enable sysfs file\n",
+		       __func__);
+
+	init_irq_work(&sg_zone.irq_work, smart_grid_irq_work);
+	INIT_WORK(&sg_zone.work, smart_grid_work_handler);
+	sg_zone.enable = 0;
+	sg_zone.is_init = 1;
+	return ret;
+}
+#endif
+
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
  *********************************************************************/
@@ -2603,6 +2836,9 @@ static int __init cpufreq_core_init(void)
 	cpufreq_global_kobject = kobject_create_and_add("cpufreq", &cpu_subsys.dev_root->kobj);
 	BUG_ON(!cpufreq_global_kobject);
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	create_smart_grid_sysfs_file();
+#endif
 	return 0;
 }
 module_param(off, int, 0444);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 3361663144a1..07df0924bced 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -18,6 +18,9 @@
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/sysfs.h>
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+#include <linux/sched/grid_qos.h>
+#endif
 /*********************************************************************
  *                        CPUFREQ INTERFACE                          *
  *********************************************************************/
@@ -529,6 +532,15 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy);
 int cpufreq_register_governor(struct cpufreq_governor *governor);
 void cpufreq_unregister_governor(struct cpufreq_governor *governor);
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+/* Implemented in cpufreq.c */
+#ifdef CONFIG_CPU_FREQ
+void cpufreq_smart_grid_start_sync(void);
+#else
+static inline void cpufreq_smart_grid_start_sync(void) { return; }
+#endif
+#endif
+
 struct cpufreq_governor *cpufreq_default_governor(void);
 struct cpufreq_governor *cpufreq_fallback_governor(void);
diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c
index 87ebd5003640..dbf070c8cc58 100644
--- a/kernel/sched/grid/qos.c
+++ b/kernel/sched/grid/qos.c
@@ -174,6 +174,7 @@ int sched_grid_zone_update(bool is_locked)
 		raw_spin_lock_irqsave(&sg_zone.lock, flags);
 	cpumask_clear(&sg_zone.cpus[SMART_GRID_ZONE_HOT]);
+	cpumask_clear(&sg_zone.cpus[SMART_GRID_ZONE_WARM]);
 	list_for_each(pos, &sg_zone.af_list_head) {
 		af_pos = list_entry(pos, struct auto_affinity, af_list);
@@ -185,14 +186,19 @@ int sched_grid_zone_update(bool is_locked)
 		cpumask_or(&sg_zone.cpus[SMART_GRID_ZONE_HOT], &sg_zone.cpus[SMART_GRID_ZONE_HOT],
 			   af_pos->ad.domains[af_pos->ad.curr_level]);
+		/* Update warm zone CPUs to max level first */
+		cpumask_or(&sg_zone.cpus[SMART_GRID_ZONE_WARM], &sg_zone.cpus[SMART_GRID_ZONE_WARM],
+			   af_pos->ad.domains[af_pos->ad.dcount - 1]);
 	}
-	cpumask_complement(&sg_zone.cpus[SMART_GRID_ZONE_WARM],
-			   &sg_zone.cpus[SMART_GRID_ZONE_HOT]);
+	/* Then remove the hot zone CPUs from the warm zone */
+	cpumask_andnot(&sg_zone.cpus[SMART_GRID_ZONE_WARM], &sg_zone.cpus[SMART_GRID_ZONE_WARM],
+		       &sg_zone.cpus[SMART_GRID_ZONE_HOT]);
 	if (!is_locked)
 		raw_spin_unlock_irqrestore(&sg_zone.lock, flags);
+	cpufreq_smart_grid_start_sync();
 	return 0;
 }
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9OJK9
CVE: NA
----------------------------------------
Across the many scenarios we tested with smart_grid, we found that the first domain level is key to benchmark results.

The reason is that factors such as interrupt affinity and memory affinity can have a big impact on the results.

Before this patch, the first domain level could not be changed after creation.

This patch introduces 'cpu.rebuild_affinity_domain' to dynamically reconfigure all domain levels.
Typical use cases:
echo $cpu_id > cpu.rebuild_affinity_domain
Here cpu_id selects the CPU around which the first domain level is built.

If we set cpu_id = 34, the domains change as follows:
 ----------------          -----------------
| level 0 (0-31) |        | level 0 (32-63) |
 ----------------          -----------------
        v                          v
 ----------------          -----------------
| level 1 (0-63) |        | level 1 (0-63)  |
 ----------------          -----------------
        v            -->           v
 ----------------          -----------------
| level 2 (0-95) |        | level 2 (0-95)  |
 ----------------          -----------------
        v                          v
 -----------------         -----------------
| level 3 (0-127) |       | level 3 (0-127) |
 -----------------         -----------------
There are a number of constraints on the rebuild feature:

1. Only rebuild domains while auto mode is disabled.
   (cpu.dynamic_affinity_mode == 1)
2. Only rebuild on active and housekeeping CPUs.
   (Offline and isolated CPUs are forbidden.)
3. This file is write only.
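For example, rebuilding a group's domains around CPU 34 (the cgroup v1 mount point is an assumption for illustration):

  echo 34 > /sys/fs/cgroup/cpu/<group>/cpu.rebuild_affinity_domain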
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 kernel/sched/core.c  | 13 +++++++++++++
 kernel/sched/fair.c  | 43 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |  1 +
 3 files changed, 57 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4e2f95d05896..538a83cb1f22 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7109,6 +7109,15 @@ static int cpu_affinity_stat_show(struct seq_file *sf, void *v)
 	return 0;
 }
+
+static int cpu_rebuild_affinity_domain_u64(struct cgroup_subsys_state *css,
+					   struct cftype *cftype,
+					   u64 cpu)
+{
+	struct task_group *tg = css_tg(css);
+
+	return tg_rebuild_affinity_domains(cpu, tg->auto_affinity);
+}
 #endif /* CONFIG_QOS_SCHED_SMART_GRID */
 #ifdef CONFIG_QOS_SCHED
@@ -7236,6 +7245,10 @@ static struct cftype cpu_legacy_files[] = {
 		.name = "affinity_stat",
 		.seq_show = cpu_affinity_stat_show,
 	},
+	{
+		.name = "rebuild_affinity_domain",
+		.write_u64 = cpu_rebuild_affinity_domain_u64,
+	},
 #endif
 	{ }	/* Terminate */
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c5318a44444e..2656238af627 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5736,6 +5736,49 @@ static void destroy_auto_affinity(struct task_group *tg)
 	kfree(tg->auto_affinity);
 	tg->auto_affinity = NULL;
 }
+
+int tg_rebuild_affinity_domains(int cpu, struct auto_affinity *auto_affi)
+{
+	int ret = 0;
+	int level = 0;
+	struct sched_domain *tmp;
+
+	if (unlikely(!auto_affi))
+		return -EPERM;
+
+	mutex_lock(&smart_grid_used_mutex);
+	raw_spin_lock_irq(&auto_affi->lock);
+	/* Only build domain while auto mode disabled */
+	if (auto_affi->mode) {
+		ret = -EPERM;
+		goto unlock_all;
+	}
+
+	/* Only build on active and housekeeping cpu */
+	if (!cpu_active(cpu) || !housekeeping_cpu(cpu, HK_FLAG_DOMAIN)) {
+		ret = -EINVAL;
+		goto unlock_all;
+	}
+
+	for_each_domain(cpu, tmp) {
+		if (!auto_affi->ad.domains[level] || !auto_affi->ad.domains_orig[level])
+			continue;
+
+		/* rebuild domain[,_orig] and reset schedstat counter */
+		cpumask_copy(auto_affi->ad.domains[level], sched_domain_span(tmp));
+		cpumask_copy(auto_affi->ad.domains_orig[level], auto_affi->ad.domains[level]);
+		__schedstat_set(auto_affi->ad.stay_cnt[level], 0);
+		level++;
+	}
+
+	/* trigger to update smart grid zone */
+	sched_grid_zone_update(false);
+
+unlock_all:
+	raw_spin_unlock_irq(&auto_affi->lock);
+	mutex_unlock(&smart_grid_used_mutex);
+	return ret;
+}
 #else
 static void destroy_auto_affinity(struct task_group *tg) {}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2dec32a61de0..4322122c44cb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -519,6 +519,7 @@ extern void start_auto_affinity(struct auto_affinity *auto_affi);
 extern void stop_auto_affinity(struct auto_affinity *auto_affi);
 extern int init_auto_affinity(struct task_group *tg);
 extern void tg_update_affinity_domains(int cpu, int online);
+extern int tg_rebuild_affinity_domains(int cpu, struct auto_affinity *auto_affi);
 #else
 static inline int init_auto_affinity(struct task_group *tg)
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9PEF6
CVE: NA
----------------------------------------
Silence a compiler error seen with allnoconfig.

Without CONFIG_CGROUP_SCHED, the following functions were implicitly declared:

1. tg_update_affinity_domains
2. init_auto_affinity
Fixes: 713cfd2684fa ("sched: Introduce smart grid scheduling strategy for cfs")
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 kernel/sched/core.c | 7 +++++++
 kernel/sched/fair.c | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 538a83cb1f22..0e332ca3244e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5843,7 +5843,9 @@ int sched_cpu_activate(unsigned int cpu)
 		static_branch_inc_cpuslocked(&sched_smt_present);
 #endif
 	set_cpu_active(cpu, true);
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
 	tg_update_affinity_domains(cpu, 1);
+#endif
 	if (sched_smp_initialized) {
 		sched_domains_numa_masks_set(cpu);
@@ -5906,7 +5908,9 @@ int sched_cpu_deactivate(unsigned int cpu)
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
 	tg_update_affinity_domains(cpu, 0);
+#endif
 	return 0;
 }
@@ -5979,7 +5983,10 @@ void __init sched_init_smp(void)
 	sched_smp_initialized = true;
 	sched_grid_zone_init();
+
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
 	init_auto_affinity(&root_task_group);
+#endif
 }
 static int __init migration_init(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2656238af627..c293b1d1efa7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5780,7 +5780,7 @@ int tg_rebuild_affinity_domains(int cpu, struct auto_affinity *auto_affi)
 	return ret;
 }
 #else
-static void destroy_auto_affinity(struct task_group *tg) {}
+static void __maybe_unused destroy_auto_affinity(struct task_group *tg) {}
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 static inline bool prefer_cpus_valid(struct task_struct *p);