hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7YS6M
-------------------------------
Expand qos_level from {-1, 0} to [-2, 2] to distinguish tasks expected to run at extremely high or low priority. Use qos_level_weight to reweight the shares when calculating a group's weight. Meanwhile, set offline tasks' scheduling policy to SCHED_IDLE so that they can be preempted in check_preempt_wakeup().
Signed-off-by: Zhao Wenhui <zhaowenhui8@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig |  1 +
 arch/x86/configs/openeuler_defconfig   |  1 +
 include/linux/sched/sysctl.h           |  4 ++
 init/Kconfig                           |  9 ++++
 kernel/sched/core.c                    | 24 ++++++----
 kernel/sched/fair.c                    | 64 ++++++++++++++++++++++++--
 kernel/sched/sched.h                   | 33 +++++++++++++
 kernel/sysctl.c                        |  9 ++++
 8 files changed, 133 insertions(+), 12 deletions(-)
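As an illustration of the resulting interfaces, below is a minimal userspace sketch (not part of this patch). The cgroup mount point and the per-group cpu.qos_level file name are assumptions that depend on the local cgroup layout; the sysctl path follows from the kern_table entry added below.

#include <stdio.h>

/* Hypothetical cgroup paths -- adjust to the local mount and hierarchy. */
#define BATCH_QOS	"/sys/fs/cgroup/cpu/batch/cpu.qos_level"
#define LATENCY_QOS	"/sys/fs/cgroup/cpu/latency/cpu.qos_level"
#define WEIGHTS		"/proc/sys/kernel/qos_level_weights"

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s", val);
	return fclose(f);
}

int main(void)
{
	/* -2 (QOS_LEVEL_OFFLINE_EX): lowest level, tasks run as SCHED_IDLE. */
	write_str(BATCH_QOS, "-2");

	/* 2 (QOS_LEVEL_HIGH_EX): highest level. */
	write_str(LATENCY_QOS, "2");

	/* One weight per level -2..2; online keeps weight 100 (unscaled). */
	write_str(WEIGHTS, "1 10 100 1000 10000");

	return 0;
}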
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 37dc1012c94d..0203ca36f600 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -177,6 +177,7 @@ CONFIG_CGROUP_PERF=y
 CONFIG_CGROUP_BPF=y
 # CONFIG_CGROUP_MISC is not set
 CONFIG_QOS_SCHED=y
+CONFIG_QOS_SCHED_MULTILEVEL=y
 # CONFIG_CGROUP_DEBUG is not set
 CONFIG_SOCK_CGROUP_DATA=y
 CONFIG_NAMESPACES=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index d750bf90ce57..3d91106faee5 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -186,6 +186,7 @@ CONFIG_CGROUP_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_QOS_SCHED=y
+CONFIG_QOS_SCHED_MULTILEVEL=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_SCHED_MM_CID=y
 CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 28d9be8e4614..3a02a76b08ca 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -37,4 +37,8 @@ extern unsigned int sysctl_overload_detect_period;
 extern unsigned int sysctl_offline_wait_interval;
 #endif
 
+#ifdef CONFIG_QOS_SCHED_MULTILEVEL
+extern unsigned int sysctl_qos_level_weights[];
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/init/Kconfig b/init/Kconfig
index d96c76143610..0bd8bb67e699 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -996,6 +996,15 @@ config QOS_SCHED
 	  If in doubt, say N.
 
+config QOS_SCHED_MULTILEVEL
+	bool "Multiple qos level task scheduling"
+	depends on QOS_SCHED
+	default n
+	help
+	  This feature enables multiple qos levels for task scheduling.
+	  It expands qos_level to [-2, 2] to distinguish tasks expected
+	  to run at extremely high or low priority.
+
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
 	depends on CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 652c06bd546d..238b5b55c38a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7689,7 +7689,7 @@ static int __sched_setscheduler(struct task_struct *p,
 	 * other than SCHED_IDLE, the online task preemption and cpu resource
 	 * isolation will be invalid, so return -EINVAL in this case.
 	 */
-	if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) {
+	if (unlikely(is_offline_level(task_group(p)->qos_level) && !idle_policy(policy))) {
 		retval = -EINVAL;
 		goto unlock;
 	}
@@ -10356,7 +10356,7 @@ static void sched_change_qos_group(struct task_struct *tsk, struct task_group *tg)
 	 */
 	if (!(tsk->flags & PF_EXITING) &&
 	    !task_group_is_autogroup(tg) &&
-	    (tg->qos_level == -1)) {
+	    (is_offline_level(tg->qos_level))) {
 		attr.sched_priority = 0;
 		attr.sched_policy = SCHED_IDLE;
 		attr.sched_nice = PRIO_TO_NICE(tsk->static_prio);
@@ -10385,7 +10385,7 @@ void sched_move_offline_task(struct task_struct *p)
 {
 	struct offline_args *args;
 
-	if (unlikely(task_group(p)->qos_level != -1))
+	if (unlikely(!is_offline_level(task_group(p)->qos_level)))
 		return;
 
 	args = kmalloc(sizeof(struct offline_args), GFP_ATOMIC);
@@ -11275,7 +11275,7 @@ static int tg_change_scheduler(struct task_group *tg, void *data)
 	struct cgroup_subsys_state *css = &tg->css;
 
 	tg->qos_level = qos_level;
-	if (qos_level == -1)
+	if (is_offline_level(qos_level))
 		policy = SCHED_IDLE;
 	else
 		policy = SCHED_NORMAL;
@@ -11297,19 +11297,27 @@ static int cpu_qos_write(struct cgroup_subsys_state *css,
 	if (!tg->se[0])
 		return -EINVAL;
 
-	if (qos_level != -1 && qos_level != 0)
+#ifdef CONFIG_QOS_SCHED_MULTILEVEL
+	if (qos_level > QOS_LEVEL_HIGH_EX || qos_level < QOS_LEVEL_OFFLINE_EX)
+#else
+	if (qos_level != QOS_LEVEL_OFFLINE && qos_level != QOS_LEVEL_ONLINE)
+#endif
 		return -EINVAL;
 
 	if (tg->qos_level == qos_level)
 		goto done;
 
-	if (tg->qos_level == -1 && qos_level == 0)
+#ifdef CONFIG_QOS_SCHED_MULTILEVEL
+	if (!is_normal_level(tg->qos_level))
+#else
+	if (tg->qos_level == QOS_LEVEL_OFFLINE && qos_level == QOS_LEVEL_ONLINE)
+#endif
 		return -EINVAL;
 
 	cpus_read_lock();
-	if (qos_level == -1)
+	if (is_offline_level(qos_level))
 		cfs_bandwidth_usage_inc();
-	else
+	else if (is_offline_level(tg->qos_level) && !is_offline_level(qos_level))
 		cfs_bandwidth_usage_dec();
 	cpus_read_unlock();
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d9af04551788..3280ae7edd9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -190,6 +190,23 @@ unsigned int sysctl_offline_wait_interval = 100; /* in ms */
 static int unthrottle_qos_cfs_rqs(int cpu);
 #endif
 
+#ifdef CONFIG_QOS_SCHED_MULTILEVEL
+#define QOS_LEVEL_WEIGHT_OFFLINE_EX	1
+#define QOS_LEVEL_WEIGHT_OFFLINE	10
+#define QOS_LEVEL_WEIGHT_ONLINE		100
+#define QOS_LEVEL_WEIGHT_HIGH		1000
+#define QOS_LEVEL_WEIGHT_HIGH_EX	10000
+
+unsigned int sysctl_qos_level_weights[5] = {
+	QOS_LEVEL_WEIGHT_OFFLINE_EX,
+	QOS_LEVEL_WEIGHT_OFFLINE,
+	QOS_LEVEL_WEIGHT_ONLINE,
+	QOS_LEVEL_WEIGHT_HIGH,
+	QOS_LEVEL_WEIGHT_HIGH_EX,
+};
+static long qos_reweight(long shares, struct task_group *tg);
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
@@ -3528,6 +3545,9 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
 	struct task_group *tg = cfs_rq->tg;
 
 	tg_shares = READ_ONCE(tg->shares);
+#ifdef CONFIG_QOS_SCHED_MULTILEVEL
+	tg_shares = qos_reweight(tg_shares, tg);
+#endif
 
 	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 
@@ -3574,6 +3594,9 @@ static void update_cfs_group(struct sched_entity *se)
 #ifndef CONFIG_SMP
 	shares = READ_ONCE(gcfs_rq->tg->shares);
+#ifdef CONFIG_QOS_SCHED_MULTILEVEL
+	shares = qos_reweight(shares, gcfs_rq->tg);
+#endif
 
 	if (likely(se->load.weight == shares))
 		return;
@@ -8258,7 +8281,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 
 static inline bool is_offline_task(struct task_struct *p)
 {
-	return task_group(p)->qos_level == -1;
+	return task_group(p)->qos_level < QOS_LEVEL_ONLINE;
 }
 
 static void start_qos_hrtimer(int cpu);
@@ -8433,7 +8456,7 @@ static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	if (unlikely(__this_cpu_read(qos_cpu_overload)))
 		return false;
 
-	if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
+	if (unlikely(cfs_rq && is_offline_level(cfs_rq->tg->qos_level) &&
 		!sched_idle_cpu(smp_processor_id()) &&
 		cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
 		throttle_qos_cfs_rq(cfs_rq);
@@ -8449,7 +8472,7 @@ static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq)
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
-	if (cfs_rq->tg->qos_level == -1 && cfs_rq_throttled(cfs_rq))
+	if (is_offline_level(cfs_rq->tg->qos_level) && cfs_rq_throttled(cfs_rq))
 		unthrottle_qos_cfs_rq(cfs_rq);
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -8462,7 +8485,7 @@ void sched_qos_offline_wait(void)
 		rcu_read_lock();
 		qos_level = task_group(current)->qos_level;
 		rcu_read_unlock();
-		if (qos_level != -1 || fatal_signal_pending(current))
+		if (!is_offline_level(qos_level) || fatal_signal_pending(current))
 			break;
 
 		schedule_timeout_killable(msecs_to_jiffies(sysctl_offline_wait_interval));
@@ -8487,6 +8510,39 @@ static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+#ifdef CONFIG_QOS_SCHED_MULTILEVEL
+static long qos_reweight(long shares, struct task_group *tg)
+{
+	long qos_weight = 100;
+	long div = 100;
+	long scale_shares;
+
+	switch (tg->qos_level) {
+	case QOS_LEVEL_OFFLINE_EX:
+		qos_weight = sysctl_qos_level_weights[0];
+		break;
+	case QOS_LEVEL_OFFLINE:
+		qos_weight = sysctl_qos_level_weights[1];
+		break;
+	case QOS_LEVEL_ONLINE:
+		qos_weight = sysctl_qos_level_weights[2];
+		break;
+	case QOS_LEVEL_HIGH:
+		qos_weight = sysctl_qos_level_weights[3];
+		break;
+	case QOS_LEVEL_HIGH_EX:
+		qos_weight = sysctl_qos_level_weights[4];
+		break;
+	}
+	if (qos_weight > LONG_MAX / shares)
+		scale_shares = LONG_MAX / div;
+	else
+		scale_shares = shares * qos_weight / div;
+	scale_shares = clamp_t(long, scale_shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
+	return scale_shares;
+}
+#endif
+
 static void start_qos_hrtimer(int cpu)
 {
 	ktime_t time;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f4e65a5e3009..a9eecb76dfff 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1213,6 +1213,39 @@ static inline bool is_migration_disabled(struct task_struct *p)
 #endif
 }
 
+#ifdef CONFIG_QOS_SCHED
+
+#ifdef CONFIG_QOS_SCHED_MULTILEVEL
+enum task_qos_level {
+	QOS_LEVEL_OFFLINE_EX = -2,
+	QOS_LEVEL_OFFLINE = -1,
+	QOS_LEVEL_ONLINE = 0,
+	QOS_LEVEL_HIGH = 1,
+	QOS_LEVEL_HIGH_EX = 2
+};
+#else
+enum task_qos_level {
+	QOS_LEVEL_OFFLINE = -1,
+	QOS_LEVEL_ONLINE = 0,
+};
+#endif
+
+static inline int is_high_level(long qos_level)
+{
+	return qos_level > QOS_LEVEL_ONLINE;
+}
+
+static inline int is_normal_level(long qos_level)
+{
+	return qos_level == QOS_LEVEL_ONLINE;
+}
+
+static inline int is_offline_level(long qos_level)
+{
+	return qos_level < QOS_LEVEL_ONLINE;
+}
+#endif
+
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e9af234bf882..1714abd73f23 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2068,6 +2068,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_ONE_HUNDRED,
 		.extra2		= &one_thousand,
 	},
+#endif
+#ifdef CONFIG_QOS_SCHED_MULTILEVEL
+	{
+		.procname	= "qos_level_weights",
+		.data		= &sysctl_qos_level_weights,
+		.maxlen		= 5*sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #endif
 	{
 		.procname	= "max_rcu_stall_to_panic",
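To make the reweighting concrete, the following standalone sketch (again not part of the patch) mirrors qos_reweight()'s arithmetic with the default weight table; the kernel-side scale_load()/clamp_t() bounds are elided, so the numbers only illustrate the weight/100 scaling applied to tg->shares.

#include <limits.h>
#include <stdio.h>

/* Default sysctl_qos_level_weights, indexed by qos_level + 2. */
static const long weights[5] = { 1, 10, 100, 1000, 10000 };

static long qos_reweight(long shares, int qos_level)
{
	long w = weights[qos_level + 2];	/* map [-2, 2] onto [0, 4] */

	if (w > LONG_MAX / shares)		/* overflow guard, as in the patch */
		return LONG_MAX / 100;
	return shares * w / 100;		/* weight 100 leaves shares unchanged */
}

int main(void)
{
	int level;

	for (level = -2; level <= 2; level++)
		printf("qos_level %2d: 1024 shares -> %ld\n",
		       level, qos_reweight(1024, level));
	return 0;
}

With the defaults, five sibling groups that each set cpu.shares to 1024 end up with effective shares of 10, 102, 1024, 10240 and 102400, i.e. roughly four orders of magnitude between the two extreme levels.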