From: Zheng Zucheng <zhengzucheng@huawei.com>

hulk inclusion
category: feature
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

We introduce the concept of a QoS level to the scheduler, currently
implemented on top of different scheduler policies. The QoS scheduler
changes the policy of the affected tasks when the QoS level of a task group
is modified through the cpu.qos_level cpu cgroup file. In this way we can
satisfy the different needs of tasks at different QoS levels.
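
For illustration, here is a minimal userspace sketch of how the new
interface might be exercised. The cgroup v1 mount point /sys/fs/cgroup/cpu
and the group name "offline" are assumptions of the example, not something
this patch creates:

  /* Hypothetical usage sketch: mark a cpu cgroup offline via cpu.qos_level. */
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/stat.h>

  static void write_str(const char *path, const char *val)
  {
  	FILE *f = fopen(path, "w");

  	if (!f) {
  		perror(path);
  		exit(1);
  	}
  	fputs(val, f);
  	fclose(f);
  }

  int main(int argc, char **argv)
  {
  	/* Create a group for offline (best-effort) tasks. */
  	mkdir("/sys/fs/cgroup/cpu/offline", 0755);

  	/* -1 marks the group offline (SCHED_IDLE); 0 keeps it online. */
  	write_str("/sys/fs/cgroup/cpu/offline/cpu.qos_level", "-1");

  	/* Move a task (pid given as argv[1]) into the offline group. */
  	if (argc > 1)
  		write_str("/sys/fs/cgroup/cpu/offline/tasks", argv[1]);

  	return 0;
  }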

Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 init/Kconfig         |  8 ++++
 kernel/sched/core.c  | 93 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |  4 ++
 3 files changed, 105 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index c05347a29ca4d..a338519692d54 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -777,6 +777,14 @@ menuconfig CGROUP_SCHED
 	  tasks.
 
 if CGROUP_SCHED
+config QOS_SCHED
+	bool "Qos task scheduling"
+	depends on CGROUP_SCHED
+	depends on CFS_BANDWIDTH
+	depends on X86
+
+	default n
+
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
 	depends on CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8866cd7f19c43..23160df884e49 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6338,6 +6338,15 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
+#ifdef CONFIG_QOS_SCHED
+static int alloc_qos_sched_group(struct task_group *tg, struct task_group *parent)
+{
+	tg->qos_level = parent->qos_level;
+
+	return 1;
+}
+#endif
+
 static void sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
@@ -6358,6 +6367,11 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_fair_sched_group(tg, parent))
 		goto err;
 
+#ifdef CONFIG_QOS_SCHED
+	if (!alloc_qos_sched_group(tg, parent))
+		goto err;
+#endif
+
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
@@ -6426,6 +6440,30 @@ static void sched_change_group(struct task_struct *tsk, int type)
 	tg = autogroup_task_group(tsk, tg);
 	tsk->sched_task_group = tg;
 
+#ifdef CONFIG_QOS_SCHED
+	/*
+	 * No need to re-setcheduler when a task is exiting or the task
+	 * is in an autogroup.
+	 */
+	if (!rt_task(tsk)
+	    && !(tsk->flags & PF_EXITING)
+	    && !task_group_is_autogroup(tg)) {
+		struct rq *rq = task_rq(tsk);
+		struct sched_attr attr = {
+			.sched_priority = 0,
+		};
+
+		if (tg->qos_level == -1) {
+			attr.sched_policy = SCHED_IDLE;
+		} else {
+			attr.sched_policy = SCHED_NORMAL;
+		}
+		attr.sched_nice = PRIO_TO_NICE(tsk->static_prio);
+
+		__setscheduler(rq, tsk, &attr, 0);
+	}
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_change_group)
 		tsk->sched_class->task_change_group(tsk, type);
@@ -6886,6 +6924,54 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_QOS_SCHED
+static int cpu_qos_write(struct cgroup_subsys_state *css,
+			 struct cftype *cftype, s64 qos_level)
+{
+	struct css_task_iter it;
+	struct task_struct *tsk;
+	struct task_group *tg;
+	struct sched_param param;
+	int pid, policy;
+	tg = css_tg(css);
+
+	if (!tg->se[0])
+		return -EINVAL;
+
+	if (qos_level != -1 && qos_level != 0)
+		return -EINVAL;
+
+	if (tg->qos_level == qos_level)
+		goto done;
+
+	if (qos_level == -1) {
+		policy = SCHED_IDLE;
+	} else {
+		policy = SCHED_NORMAL;
+	}
+
+	tg->qos_level = qos_level;
+
+	param.sched_priority = 0;
+	css_task_iter_start(css, 0, &it);
+	while ((tsk = css_task_iter_next(&it))) {
+		pid = task_tgid_vnr(tsk);
+
+		if (pid > 0 && !rt_task(tsk))
+			sched_setscheduler(tsk, policy, &param);
+	}
+	css_task_iter_end(&it);
+
+done:
+	return 0;
+}
+
+static s64 cpu_qos_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return css_tg(css)->qos_level;
+}
+#endif /* CONFIG_QOS_SCHED */
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -6921,6 +7007,13 @@ static struct cftype cpu_legacy_files[] = {
 		.read_u64 = cpu_rt_period_read_uint,
 		.write_u64 = cpu_rt_period_write_uint,
 	},
+#endif
+#ifdef CONFIG_QOS_SCHED
+	{
+		.name = "qos_level",
+		.read_s64 = cpu_qos_read,
+		.write_s64 = cpu_qos_write,
+	},
 #endif
 	{ }	/* Terminate */
 };
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e6238db9dc996..c263cb2f35c5d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -402,7 +402,11 @@ struct task_group {
 
 	struct cfs_bandwidth	cfs_bandwidth;
 
+#if defined(CONFIG_QOS_SCHED) && !defined(__GENKSYMS__)
+	long qos_level;
+#else
 	KABI_RESERVE(1)
+#endif
 	KABI_RESERVE(2)
 };
From: Zhang Qiao <zhangqiao22@huawei.com>

hulk inclusion
category: feature
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

In a co-location scenario, we usually deploy online and offline task groups
on the same server.

Online tasks are more important than offline tasks. To prevent offline
tasks from disturbing online tasks, we throttle the offline task groups
when online task groups are running on the same CPU, and unthrottle the
offline tasks when the CPU is about to enter the idle state.

Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/core.c |   2 +
 kernel/sched/fair.c | 146 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 23160df884e49..83818e2df1b52 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6946,8 +6946,10 @@ static int cpu_qos_write(struct cgroup_subsys_state *css,
 	if (qos_level == -1) {
 		policy = SCHED_IDLE;
+		cfs_bandwidth_usage_inc();
 	} else {
 		policy = SCHED_NORMAL;
+		cfs_bandwidth_usage_dec();
 	}
 
 	tg->qos_level = qos_level;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1ebe39020a7b2..24ea9b7a716e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -99,6 +99,10 @@ int __weak arch_asym_cpu_priority(int cpu)
 }
 #endif
 
+#ifdef CONFIG_QOS_SCHED
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
@@ -6868,6 +6872,128 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		set_last_buddy(se);
 }
 
+#ifdef CONFIG_QOS_SCHED
+static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se;
+	long task_delta, idle_task_delta, dequeue = 1;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	/* freeze hierarchy runnable averages while throttled */
+	rcu_read_lock();
+	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+	rcu_read_unlock();
+
+	task_delta = cfs_rq->h_nr_running;
+	idle_task_delta = cfs_rq->idle_h_nr_running;
+	for_each_sched_entity(se) {
+		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		/* throttled entity or throttle-on-deactivate */
+		if (!se->on_rq)
+			break;
+
+		if (dequeue)
+			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		qcfs_rq->h_nr_running -= task_delta;
+		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+
+		if (qcfs_rq->load.weight)
+			dequeue = 0;
+	}
+
+	if (!se) {
+		sub_nr_running(rq, task_delta);
+	}
+
+	cfs_rq->throttled = 1;
+	cfs_rq->throttled_clock = rq_clock(rq);
+
+	list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq)));
+}
+
+static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct sched_entity *se;
+	int enqueue = 1;
+	long task_delta, idle_task_delta;
+
+	se = cfs_rq->tg->se[cpu_of(rq)];
+
+	cfs_rq->throttled = 0;
+
+	update_rq_clock(rq);
+
+	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+	list_del_init(&cfs_rq->throttled_list);
+
+	/* update hierarchical throttle state */
+	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
+
+	if (!cfs_rq->load.weight)
+		return;
+
+	task_delta = cfs_rq->h_nr_running;
+	idle_task_delta = cfs_rq->idle_h_nr_running;
+	for_each_sched_entity(se) {
+		if (se->on_rq)
+			enqueue = 0;
+
+		cfs_rq = cfs_rq_of(se);
+		if (enqueue)
+			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+		cfs_rq->h_nr_running += task_delta;
+		cfs_rq->idle_h_nr_running += idle_task_delta;
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+
+	assert_list_leaf_cfs_rq(rq);
+
+	if (!se) {
+		add_nr_running(rq, task_delta);
+	}
+
+	/* Determine whether we need to wake up potentially idle CPU: */
+	if (rq->curr == rq->idle && rq->cfs.nr_running)
+		resched_curr(rq);
+}
+
+static int unthrottle_qos_cfs_rqs(int cpu)
+{
+	struct cfs_rq *cfs_rq, *tmp_rq;
+	int res = 0;
+
+	list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), throttled_list) {
+		if (cfs_rq_throttled(cfs_rq)) {
+			unthrottle_qos_cfs_rq(cfs_rq);
+			res++;
+		}
+	}
+
+	return res;
+}
+
+static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_bandwidth_used())
+		return false;
+
+	if (cfs_rq && cfs_rq->tg->qos_level < 0 &&
+	    !sched_idle_cpu(cpu_of(rq_of(cfs_rq)))) {
+		throttle_qos_cfs_rq(cfs_rq);
+		return true;
+	}
+
+	return false;
+}
+#endif
+
 static struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -6926,6 +7052,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 		se = pick_next_entity(cfs_rq, curr);
 		cfs_rq = group_cfs_rq(se);
+#ifdef CONFIG_QOS_SCHED
+		if (check_qos_cfs_rq(cfs_rq)) {
+			cfs_rq = &rq->cfs;
+			BUG_ON(cfs_rq->nr_running == 0);
+		}
+#endif
 	} while (cfs_rq);
 
 	p = task_of(se);
@@ -7015,6 +7147,12 @@ done: __maybe_unused;
 	if (new_tasks > 0)
 		goto again;
 
+#ifdef CONFIG_QOS_SCHED
+	if (unthrottle_qos_cfs_rqs(cpu_of(rq))) {
+		goto again;
+	}
+#endif
+
 	return NULL;
 }
 
@@ -10688,6 +10826,14 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
 
 __init void init_sched_fair_class(void)
 {
+#ifdef CONFIG_QOS_SCHED
+	int i;
+
+	for_each_possible_cpu(i) {
+		INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i));
+	}
+#endif
+
 #ifdef CONFIG_SMP
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
From: Chen Hui <clare.chenhui@huawei.com>

hulk inclusion
category: feature
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/x86/configs/openeuler_defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 854f10c491880..0b103110f821c 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -130,6 +130,7 @@ CONFIG_BLK_CGROUP=y
 CONFIG_CGROUP_WRITEBACK=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_QOS_SCHED=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_CGROUP_PIDS=y
From: Zhang Qiao <zhangqiao22@huawei.com>

hulk inclusion
category: feature
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

To allow throttled tasks to continue running after a CPU goes offline, we
need to unthrottle the throttled cfs_rq entries saved in the per-cpu
qos_throttled_cfs_rq list.

Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/fair.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 24ea9b7a716e9..01ae007760376 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -101,6 +101,8 @@ int __weak arch_asym_cpu_priority(int cpu)
 
 #ifdef CONFIG_QOS_SCHED
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
+
+static int unthrottle_qos_cfs_rqs(int cpu);
 #endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -5131,6 +5133,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 			unthrottle_cfs_rq(cfs_rq);
 	}
 	rcu_read_unlock();
+
+#ifdef CONFIG_QOS_SCHED
+	unthrottle_qos_cfs_rqs(cpu_of(rq));
+#endif
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
From: Zheng Zucheng <zhengzucheng@huawei.com>

hulk inclusion
category: feature
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

Add restrictions on modifying task scheduling policies:

1. A task group cannot be changed from offline back to online.
2. When the QoS level of a parent task group is modified, the tasks in all
   of its child groups are switched to the same scheduling policy as the
   parent.
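
As an illustration of restriction 1, a usage sketch (the group path
/sys/fs/cgroup/cpu/offline is a hypothetical setup, not part of this
patch): switching a group back from offline to online is expected to fail.

  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
  	const char *path = "/sys/fs/cgroup/cpu/offline/cpu.qos_level";
  	int fd = open(path, O_WRONLY);

  	if (fd < 0) {
  		perror("open");
  		return 1;
  	}

  	if (write(fd, "-1", 2) < 0)	/* online -> offline: allowed */
  		perror("write -1");

  	if (write(fd, "0", 1) < 0)	/* offline -> online: rejected */
  		printf("back to online failed: %s (expected EINVAL)\n",
  		       strerror(errno));

  	close(fd);
  	return 0;
  }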

Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/core.c | 46 ++++++++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 83818e2df1b52..e6f3f88d19d8b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6925,25 +6925,16 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_QOS_SCHED
-static int cpu_qos_write(struct cgroup_subsys_state *css,
-			 struct cftype *cftype, s64 qos_level)
+static int tg_change_scheduler(struct task_group *tg, void *data)
 {
+	int pid, policy;
 	struct css_task_iter it;
-	struct task_struct *tsk;
-	struct task_group *tg;
 	struct sched_param param;
-	int pid, policy;
-	tg = css_tg(css);
-
-	if (!tg->se[0])
-		return -EINVAL;
-
-	if (qos_level != -1 && qos_level != 0)
-		return -EINVAL;
-
-	if (tg->qos_level == qos_level)
-		goto done;
+	struct task_struct *tsk;
+	s64 qos_level = *(s64 *)data;
+	struct cgroup_subsys_state *css = &tg->css;
 
+	tg->qos_level = qos_level;
 	if (qos_level == -1) {
 		policy = SCHED_IDLE;
 		cfs_bandwidth_usage_inc();
@@ -6952,8 +6943,6 @@ static int cpu_qos_write(struct cgroup_subsys_state *css,
 		cfs_bandwidth_usage_dec();
 	}
 
-	tg->qos_level = qos_level;
-
 	param.sched_priority = 0;
 	css_task_iter_start(css, 0, &it);
 	while ((tsk = css_task_iter_next(&it))) {
@@ -6964,6 +6953,29 @@ static int cpu_qos_write(struct cgroup_subsys_state *css,
 	}
 	css_task_iter_end(&it);
 
+	return 0;
+}
+
+static int cpu_qos_write(struct cgroup_subsys_state *css,
+			 struct cftype *cftype, s64 qos_level)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (!tg->se[0])
+		return -EINVAL;
+
+	if (qos_level != -1 && qos_level != 0)
+		return -EINVAL;
+
+	if (tg->qos_level == qos_level)
+		goto done;
+
+	if (tg->qos_level == -1 && qos_level == 0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level));
+	rcu_read_unlock();
 done:
 	return 0;
 }
From: Chen Hui <clare.chenhui@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

Remove residual checks that are performed when tasks are moved to a new
task group or when a new value is written to the cpu.qos_level cgroup file.

Signed-off-by: Chen Hui <clare.chenhui@huawei.com>
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/core.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e6f3f88d19d8b..1e8fca9b53d84 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6445,8 +6445,7 @@ static void sched_change_group(struct task_struct *tsk, int type)
 	 * No need to re-setcheduler when a task is exiting or the task
 	 * is in an autogroup.
 	 */
-	if (!rt_task(tsk)
-	    && !(tsk->flags & PF_EXITING)
+	if (!(tsk->flags & PF_EXITING)
 	    && !task_group_is_autogroup(tg)) {
 		struct rq *rq = task_rq(tsk);
 		struct sched_attr attr = {
@@ -6455,12 +6454,9 @@ static void sched_change_group(struct task_struct *tsk, int type)
 		if (tg->qos_level == -1) {
 			attr.sched_policy = SCHED_IDLE;
-		} else {
-			attr.sched_policy = SCHED_NORMAL;
+			attr.sched_nice = PRIO_TO_NICE(tsk->static_prio);
+			__setscheduler(rq, tsk, &attr, 0);
 		}
-		attr.sched_nice = PRIO_TO_NICE(tsk->static_prio);
-
-		__setscheduler(rq, tsk, &attr, 0);
 	}
 #endif
@@ -6927,7 +6923,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 #ifdef CONFIG_QOS_SCHED
 static int tg_change_scheduler(struct task_group *tg, void *data)
 {
-	int pid, policy;
+	int policy;
 	struct css_task_iter it;
 	struct sched_param param;
 	struct task_struct *tsk;
@@ -6945,12 +6941,8 @@ static int tg_change_scheduler(struct task_group *tg, void *data)
 	param.sched_priority = 0;
 	css_task_iter_start(css, 0, &it);
-	while ((tsk = css_task_iter_next(&it))) {
-		pid = task_tgid_vnr(tsk);
-
-		if (pid > 0 && !rt_task(tsk))
-			sched_setscheduler(tsk, policy, &param);
-	}
+	while ((tsk = css_task_iter_next(&it)))
+		sched_setscheduler(tsk, policy, &param);
 	css_task_iter_end(&it);
 
 	return 0;
From: Zheng Zucheng <zhengzucheng@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

When an offline task invokes the sched_setscheduler interface to change its
scheduling policy to SCHED_OTHER, a system panic is triggered. Reject such
a policy change with -EINVAL instead.
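
For illustration, a reproducer sketch (to be run from a task inside an
offline group, i.e. qos_level == -1; hypothetical setup, not part of this
patch). With this change the policy switch fails with EINVAL instead of
leading to a panic:

  #include <errno.h>
  #include <sched.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
  	struct sched_param param = { .sched_priority = 0 };

  	/* Try to switch the calling offline task to SCHED_OTHER. */
  	if (sched_setscheduler(0, SCHED_OTHER, &param) < 0)
  		printf("sched_setscheduler: %s (expected EINVAL)\n",
  		       strerror(errno));

  	return 0;
  }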

Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/core.c | 12 ++++++++++++
 kernel/sched/fair.c |  3 ++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1e8fca9b53d84..155391164de11 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4333,6 +4333,18 @@ static int __sched_setscheduler(struct task_struct *p,
 	}
 change:
 
+#ifdef CONFIG_QOS_SCHED
+	/*
+	 * If the scheduling policy of an offline task is set to a policy
+	 * other than SCHED_IDLE, the online task preemption will be invalid,
+	 * so return -EINVAL in this case.
+	 */
+	if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) {
+		task_rq_unlock(rq, p, &rf);
+		return -EINVAL;
+	}
+#endif
+
 	if (user) {
 #ifdef CONFIG_RT_GROUP_SCHED
 		/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 01ae007760376..1dd1e34f3d596 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6991,7 +6991,8 @@ static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		return false;
 
 	if (cfs_rq && cfs_rq->tg->qos_level < 0 &&
-	    !sched_idle_cpu(cpu_of(rq_of(cfs_rq)))) {
+	    !sched_idle_cpu(cpu_of(rq_of(cfs_rq))) &&
+	    cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running) {
 		throttle_qos_cfs_rq(cfs_rq);
 		return true;
 	}
From: Zheng Zucheng <zhengzucheng@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/fair.c | 22 +++++----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1dd1e34f3d596..68adc25c1a249 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6984,21 +6984,6 @@ static int unthrottle_qos_cfs_rqs(int cpu)
 	return res;
 }
-
-static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
-{
-	if (!cfs_bandwidth_used())
-		return false;
-
-	if (cfs_rq && cfs_rq->tg->qos_level < 0 &&
-	    !sched_idle_cpu(cpu_of(rq_of(cfs_rq))) &&
-	    cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running) {
-		throttle_qos_cfs_rq(cfs_rq);
-		return true;
-	}
-
-	return false;
-}
 #endif
 
 static struct task_struct *
@@ -7060,9 +7045,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 		se = pick_next_entity(cfs_rq, curr);
 		cfs_rq = group_cfs_rq(se);
 #ifdef CONFIG_QOS_SCHED
-		if (check_qos_cfs_rq(cfs_rq)) {
+		if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
+			     !sched_idle_cpu(cpu_of(rq)) &&
+			     cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
+			throttle_qos_cfs_rq(cfs_rq);
 			cfs_rq = &rq->cfs;
-			BUG_ON(cfs_rq->nr_running == 0);
+			WARN_ON(cfs_rq->nr_running == 0);
 		}
 #endif
 	} while (cfs_rq);
From: Zheng Zucheng <zhengzucheng@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

If online tasks occupy 100% of the CPU, offline tasks cannot be scheduled
because they are throttled; as a result, an offline task cannot respond in
time after receiving a SIGKILL signal. When an offline task is killed, move
it to the root task group and restore SCHED_NORMAL so that it can exit
promptly.
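
A rough test sketch for this behaviour (assumptions: it is started inside
an offline group while online tasks keep the CPU fully busy; not part of
this patch): fork a CPU-bound child, SIGKILL it, and measure how long it
takes to be reaped.

  #include <signal.h>
  #include <stdio.h>
  #include <sys/wait.h>
  #include <time.h>
  #include <unistd.h>

  int main(void)
  {
  	struct timespec t0, t1;
  	pid_t pid = fork();

  	if (pid == 0)
  		for (;;)	/* offline busy loop */
  			;

  	sleep(1);
  	clock_gettime(CLOCK_MONOTONIC, &t0);
  	kill(pid, SIGKILL);
  	waitpid(pid, NULL, 0);
  	clock_gettime(CLOCK_MONOTONIC, &t1);

  	printf("SIGKILL handled in %.3f s\n",
  	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
  	return 0;
  }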

Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/cgroup.h |  4 ++++
 include/linux/sched.h  |  4 ++++
 kernel/cgroup/cgroup.c | 22 ++++++++++++++++++++++
 kernel/sched/core.c    | 32 ++++++++++++++++++++++++++++++++
 kernel/signal.c        |  3 +++
 5 files changed, 65 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 02da4e1def61e..f2273649c31b6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -886,4 +886,8 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns)
 		free_cgroup_ns(ns);
 }
 
+#ifdef CONFIG_QOS_SCHED
+void cgroup_move_task_to_root(struct task_struct *tsk);
+#endif
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ef03ea1450215..f3aecacbb1147 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1951,4 +1951,8 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+#ifdef CONFIG_QOS_SCHED
+void sched_move_offline_task(struct task_struct *p);
+#endif
+
 #endif
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 682c5e231bddc..6f4dcdd6f77b2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2719,6 +2719,28 @@ void cgroup_procs_write_finish(struct task_struct *task)
 			ss->post_attach();
 }
 
+#ifdef CONFIG_QOS_SCHED
+void cgroup_move_task_to_root(struct task_struct *tsk)
+{
+	struct css_set *css;
+	struct cgroup *cpu_cgrp;
+	struct cgroup *cpu_root_cgrp;
+
+	mutex_lock(&cgroup_mutex);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
+	spin_lock_irq(&css_set_lock);
+	css = task_css_set(tsk);
+	cpu_cgrp = css->subsys[cpu_cgrp_id]->cgroup;
+	cpu_root_cgrp = &cpu_cgrp->root->cgrp;
+	spin_unlock_irq(&css_set_lock);
+
+	(void)cgroup_attach_task(cpu_root_cgrp, tsk, false);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	mutex_unlock(&cgroup_mutex);
+}
+#endif
+
 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
 {
 	struct cgroup_subsys *ss;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 155391164de11..b9adb15796748 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6357,6 +6357,38 @@ static int alloc_qos_sched_group(struct task_group *tg, struct task_group *paren
 	return 1;
 }
+
+struct offline_args {
+	struct work_struct work;
+	struct task_struct *p;
+};
+
+static void sched_move_work(struct work_struct *work)
+{
+	struct sched_param param = { .sched_priority = 0 };
+	struct offline_args *args = container_of(work, struct offline_args, work);
+
+	cgroup_move_task_to_root(args->p);
+	sched_setscheduler(args->p, SCHED_NORMAL, &param);
+	put_task_struct(args->p);
+	kfree(args);
+}
+
+void sched_move_offline_task(struct task_struct *p)
+{
+	struct offline_args *args;
+
+	if (unlikely(task_group(p)->qos_level != -1))
+		return;
+
+	args = kmalloc(sizeof(struct offline_args), GFP_ATOMIC);
+	if (args) {
+		get_task_struct(p);
+		args->p = p;
+		INIT_WORK(&args->work, sched_move_work);
+		queue_work(system_highpri_wq, &args->work);
+	}
+}
 #endif
 
 static void sched_free_group(struct task_group *tg)
diff --git a/kernel/signal.c b/kernel/signal.c
index 1cb65922a6a97..dd8690d861729 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1040,6 +1040,9 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 			signal->group_stop_count = 0;
 			t = p;
 			do {
+#ifdef CONFIG_QOS_SCHED
+				sched_move_offline_task(t);
+#endif
 				task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
 				sigaddset(&t->pending.signal, SIGKILL);
 				signal_wake_up(t, 1);
From: Zheng Zucheng <zhengzucheng@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

When offline tasks are unthrottled successfully, we should clear
rq->idle_stamp, since the CPU does not actually go idle.

Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/fair.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 68adc25c1a249..d6b16c99c8656 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7144,6 +7144,7 @@ done: __maybe_unused;
 
 #ifdef CONFIG_QOS_SCHED
 	if (unthrottle_qos_cfs_rqs(cpu_of(rq))) {
+		rq->idle_stamp = 0;
 		goto again;
 	}
 #endif
From: Zheng Zucheng <zhengzucheng@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

In some corner cases, when we throttle the last group sched entity on the
rq, the kernel panics because no runnable sched entities remain on the rq.
Add a protection to prevent this case.

Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/fair.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d6b16c99c8656..229ae0adfcf3f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7050,7 +7050,11 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 			     cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
 			throttle_qos_cfs_rq(cfs_rq);
 			cfs_rq = &rq->cfs;
-			WARN_ON(cfs_rq->nr_running == 0);
+			WARN(cfs_rq->nr_running == 0,
+			     "rq->nr_running=%u, cfs_rq->idle_h_nr_running=%u\n",
+			     rq->nr_running, cfs_rq->idle_h_nr_running);
+			if (unlikely(!cfs_rq->nr_running))
+				return NULL;
 		}
 #endif
 	} while (cfs_rq);
From: Zhang Qiao <zhangqiao22@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/fair.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 229ae0adfcf3f..5aaf9312cc295 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10544,6 +10544,19 @@ static void task_change_group_fair(struct task_struct *p, int type)
 	}
 }
 
+#ifdef CONFIG_QOS_SCHED
+static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct rq_flags rf;
+
+	rq_lock_irqsave(rq, &rf);
+	if (cfs_rq->tg->qos_level == -1 && cfs_rq_throttled(cfs_rq))
+		unthrottle_qos_cfs_rq(cfs_rq);
+	rq_unlock_irqrestore(rq, &rf);
+}
+#endif
+
 void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
@@ -10551,6 +10564,10 @@ void free_fair_sched_group(struct task_group *tg)
 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(i) {
+#ifdef CONFIG_QOS_SCHED
+		if (tg->cfs_rq)
+			unthrottle_qos_sched_group(tg->cfs_rq[i]);
+#endif
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
 		if (tg->se)
From: Zheng Zucheng <zhengzucheng@huawei.com>

hulk inclusion
category: feature
bugzilla: 51828, https://gitee.com/openeuler/kernel/issues/I4K96G
CVE: NA
--------------------------------

When online tasks occupy the CPU for a long time, offline tasks get no
chance to run and a priority inversion issue may be triggered. If this
happens, we unthrottle the offline tasks so that they get a chance to run.
When online tasks have occupied the CPU for more than 5s (the default
value), we unthrottle the offline tasks, and on their way out to usermode
they enter an msleep loop until the CPU goes idle.
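
For illustration, a tuning sketch for the two knobs this patch adds to
kern_table (the /proc/sys/kernel paths follow from the procnames below and
are assumptions of the example; values must stay within the sysctl limits,
1000-100000 ms and 100-1000 ms respectively):

  #include <stdio.h>

  static int write_sysctl(const char *path, const char *val)
  {
  	FILE *f = fopen(path, "w");

  	if (!f)
  		return -1;
  	fputs(val, f);
  	fclose(f);
  	return 0;
  }

  int main(void)
  {
  	/* Detect CPU overload after 10s instead of the default 5s. */
  	write_sysctl("/proc/sys/kernel/qos_overload_detect_period_ms", "10000");

  	/* Offline tasks re-check the overload flag every 200ms. */
  	write_sysctl("/proc/sys/kernel/qos_offline_wait_interval_ms", "200");

  	return 0;
  }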

Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/x86/entry/common.c      |  7 ++-
 include/linux/sched.h        |  7 +++
 include/linux/sched/sysctl.h |  5 ++
 kernel/sched/core.c          |  3 ++
 kernel/sched/fair.c          | 99 +++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h         |  3 ++
 kernel/sysctl.c              | 23 +++++++++
 7 files changed, 139 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 0723098a3961a..c66c9d141a326 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -162,6 +162,10 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
 		if (cached_flags & _TIF_SIGPENDING)
 			do_signal(regs);
 
+#ifdef CONFIG_QOS_SCHED
+		sched_qos_offline_wait();
+#endif
+
 		if (cached_flags & _TIF_NOTIFY_RESUME) {
 			clear_thread_flag(TIF_NOTIFY_RESUME);
 			tracehook_notify_resume(regs);
@@ -194,7 +198,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 
 	cached_flags = READ_ONCE(ti->flags);
 
-	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
+	if (unlikely((cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS) ||
+		     sched_qos_cpu_overload()))
 		exit_to_usermode_loop(regs, cached_flags);
 
 #ifdef CONFIG_COMPAT
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f3aecacbb1147..2d864748d696f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1953,6 +1953,13 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #ifdef CONFIG_QOS_SCHED
 void sched_move_offline_task(struct task_struct *p);
+void sched_qos_offline_wait(void);
+int sched_qos_cpu_overload(void);
+#else
+static inline int sched_qos_cpu_overload(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index cabdc1ed134a0..b277fbc807ecb 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -67,6 +67,11 @@ extern int sysctl_sched_rt_runtime;
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
 
+#ifdef CONFIG_QOS_SCHED
+extern unsigned int sysctl_overload_detect_period;
+extern unsigned int sysctl_offline_wait_interval;
+#endif
+
 #ifdef CONFIG_SCHED_AUTOGROUP
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b9adb15796748..487091389934e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6115,6 +6115,9 @@ void __init sched_init(void)
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
 		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+#ifdef CONFIG_QOS_SCHED
+		init_qos_hrtimer(i);
+#endif
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5aaf9312cc295..52029f3a74394 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -24,7 +24,9 @@
 #ifdef CONFIG_SCHED_STEAL
 #include "sparsemask.h"
 #endif
-
+#ifdef CONFIG_QOS_SCHED
+#include <linux/delay.h>
+#endif
 #include <trace/events/sched.h>
 
 /*
@@ -101,7 +103,10 @@ int __weak arch_asym_cpu_priority(int cpu)
 
 #ifdef CONFIG_QOS_SCHED
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
-
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer);
+static DEFINE_PER_CPU(int, qos_cpu_overload);
+unsigned int sysctl_overload_detect_period = 5000;  /* in ms */
+unsigned int sysctl_offline_wait_interval = 100;  /* in ms */
 static int unthrottle_qos_cfs_rqs(int cpu);
 #endif
@@ -6879,6 +6884,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 }
 
 #ifdef CONFIG_QOS_SCHED
+static void start_qos_hrtimer(int cpu);
 static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
@@ -6913,6 +6919,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		sub_nr_running(rq, task_delta);
 	}
 
+	if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq))))
+		start_qos_hrtimer(cpu_of(rq));
+
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
@@ -6969,7 +6978,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		resched_curr(rq);
 }
 
-static int unthrottle_qos_cfs_rqs(int cpu)
+static int __unthrottle_qos_cfs_rqs(int cpu)
 {
 	struct cfs_rq *cfs_rq, *tmp_rq;
 	int res = 0;
@@ -6984,6 +6993,83 @@ static int unthrottle_qos_cfs_rqs(int cpu)
 	return res;
 }
+
+static int unthrottle_qos_cfs_rqs(int cpu)
+{
+	int res;
+
+	res = __unthrottle_qos_cfs_rqs(cpu);
+	if (res)
+		hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu)));
+
+	return res;
+}
+
+static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (unlikely(__this_cpu_read(qos_cpu_overload))) {
+		return false;
+	}
+
+	if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
+		     !sched_idle_cpu(smp_processor_id()) &&
+		     cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
+		throttle_qos_cfs_rq(cfs_rq);
+		return true;
+	}
+
+	return false;
+}
+
+void sched_qos_offline_wait(void)
+{
+	long qos_level;
+
+	while (unlikely(this_cpu_read(qos_cpu_overload))) {
+		rcu_read_lock();
+		qos_level = task_group(current)->qos_level;
+		rcu_read_unlock();
+		if (qos_level != -1 || signal_pending(current))
+			break;
+		msleep_interruptible(sysctl_offline_wait_interval);
+	}
+}
+
+int sched_qos_cpu_overload(void)
+{
+	return __this_cpu_read(qos_cpu_overload);
+}
+
+static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer)
+{
+	struct rq_flags rf;
+	struct rq *rq = this_rq();
+
+	rq_lock_irqsave(rq, &rf);
+	if (__unthrottle_qos_cfs_rqs(smp_processor_id()))
+		__this_cpu_write(qos_cpu_overload, 1);
+	rq_unlock_irqrestore(rq, &rf);
+
+	return HRTIMER_NORESTART;
+}
+
+static void start_qos_hrtimer(int cpu)
+{
+	ktime_t time;
+	struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu));
+
+	time = ktime_add_ms(hrtimer->base->get_time(), (u64)sysctl_overload_detect_period);
+	hrtimer_set_expires(hrtimer, time);
+	hrtimer_start_expires(hrtimer, HRTIMER_MODE_ABS_PINNED);
+}
+
+void init_qos_hrtimer(int cpu)
+{
+	struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu));
+
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	hrtimer->function = qos_overload_timer_handler;
+}
 #endif
 
 static struct task_struct *
@@ -7045,10 +7131,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 		se = pick_next_entity(cfs_rq, curr);
 		cfs_rq = group_cfs_rq(se);
 #ifdef CONFIG_QOS_SCHED
-		if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
-			     !sched_idle_cpu(cpu_of(rq)) &&
-			     cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
-			throttle_qos_cfs_rq(cfs_rq);
+		if (check_qos_cfs_rq(cfs_rq)) {
 			cfs_rq = &rq->cfs;
 			WARN(cfs_rq->nr_running == 0,
 			     "rq->nr_running=%u, cfs_rq->idle_h_nr_running=%u\n",
@@ -7151,6 +7234,8 @@ done: __maybe_unused;
 		rq->idle_stamp = 0;
 		goto again;
 	}
+
+	__this_cpu_write(qos_cpu_overload, 0);
 #endif
 
 	return NULL;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c263cb2f35c5d..1aaff1aa89f62 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -994,6 +994,9 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+#ifdef CONFIG_QOS_SCHED
+void init_qos_hrtimer(int cpu);
+#endif
 
 #ifdef CONFIG_SCHED_SMT
 extern void __update_idle_core(struct rq *rq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7d3d7ad953df2..35512e2ea8a34 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -138,6 +138,9 @@ static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
+#ifdef CONFIG_QOS_SCHED
+static int hundred_thousand = 100000;
+#endif
 #ifdef CONFIG_PERF_EVENTS
 static int six_hundred_forty_kb = 640 * 1024;
 #endif
@@ -1280,6 +1283,26 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &three,
 	},
+#endif
+#ifdef CONFIG_QOS_SCHED
+	{
+		.procname	= "qos_overload_detect_period_ms",
+		.data		= &sysctl_overload_detect_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one_thousand,
+		.extra2		= &hundred_thousand,
+	},
+	{
+		.procname	= "qos_offline_wait_interval_ms",
+		.data		= &sysctl_offline_wait_interval,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one_hundred,
+		.extra2		= &one_thousand,
+	},
 #endif
 	{ }
 };