This series introduces the qos scheduler for co-locating online (latency-sensitive) and offline tasks: task groups are marked online or offline through a cpu.qos_level cpu cgroup file, offline groups are throttled while online tasks run on the same cpu, and follow-up patches fix SIGKILL handling, cpu hotplug, priority inversion and cfs bandwidth interactions.
Xia Fukun (1):
  sched/qos: Fix warning in CPU hotplug scenarios

Zhang Qiao (15):
  sched: Introduce qos scheduler for co-location
  sched: Throttle qos cfs_rq when current cpu is running online task
  sched: Fix offline task can't be killed in a timely
  sched: Unthrottle the throttled cfs rq when offline rq
  sched: Unthrottle qos cfs rq when free a task group
  sched: bugfix setscheduler unlock cpuset_rwsem
  sched: Introduce handle priority reversion mechanism
  sched: enable CONFIG_QOS_SCHED on arm64
  sched: Fix sleeping in atomic context at cpu_qos_write()
  sched/fair: Add qos_throttle_list node in struct cfs_rq
  sched: Throttle offline task at tracehook_notify_resume()
  sched/qos: Add qos_tg_{throttle,unthrottle}_{up,down}
  sched/fair: Update rq clock before unthrottle a qos cfs_rq
  sched/qos: Don't unthrottle cfs_rq when cfs_rq is throttled by qos
  sched: Enable qos scheduler config
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 include/linux/cgroup.h                 |   4 +
 include/linux/resume_user_mode.h       |   5 +
 include/linux/sched.h                  |  23 ++
 include/linux/sched/sysctl.h           |   8 +
 init/Kconfig                           |  12 +
 kernel/cgroup/cgroup.c                 |  22 ++
 kernel/sched/core.c                    | 155 +++++++++++
 kernel/sched/fair.c                    | 357 ++++++++++++++++++++++++-
 kernel/sched/sched.h                   |  12 +
 kernel/signal.c                        |   3 +
 kernel/sysctl.c                        |  24 ++
 13 files changed, 626 insertions(+), 1 deletion(-)
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
In cloud-native hybrid deployment scenarios, online tasks must preempt offline tasks in a timely manner, and offline tasks must not affect the QoS of online tasks. We therefore introduce the notion of a qos level into the scheduler, currently implemented on top of the existing scheduler policies. The qos scheduler changes the policy of the affected tasks when the qos level of a task group is modified through the cpu.qos_level cpu cgroup file. In this way, tasks at different qos levels can be treated according to their different needs.

The value of qos_level can be 0 or -1; the default is 0. If qos_level is 0, the group is an online group; otherwise it is an offline group.
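For illustration, the interface can be driven from user space as sketched below; the cgroup mount point /sys/fs/cgroup/cpu and the group name "offline" are assumptions made for the example, not part of this patch.

/* Illustrative only: mark an existing cpu cgroup as an offline group. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/cpu/offline/cpu.qos_level";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open cpu.qos_level");
		return 1;
	}
	/* "0" marks an online group (the default), "-1" an offline group */
	if (write(fd, "-1", 2) != 2)
		perror("write cpu.qos_level");
	close(fd);
	return 0;
}

Note that, as implemented in cpu_qos_write() below, a group that has been switched to -1 cannot be switched back to 0.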
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 init/Kconfig         |  12 +++++
 kernel/sched/core.c  | 116 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |   4 ++
 3 files changed, 132 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig index 6d35728b94b2..be94ba25ecca 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -985,6 +985,18 @@ menuconfig CGROUP_SCHED tasks.
if CGROUP_SCHED +config QOS_SCHED + bool "Qos task scheduling" + depends on CGROUP_SCHED + depends on CFS_BANDWIDTH + default n + help + This option enable qos scheduler, and support co-location online + services (Latency Sensitive) and offline tasks. colocation can + effectively improve the resource utilization. + + If in doubt, say N. + config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9e9a45a3394f..9f466f43c023 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7730,6 +7730,18 @@ static int __sched_setscheduler(struct task_struct *p, } change:
+#ifdef CONFIG_QOS_SCHED + /* + * If the scheduling policy of an offline task is set to a policy + * other than SCHED_IDLE, the online task preemption and cpu resource + * isolation will be invalid, so return -EINVAL in this case. + */ + if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) { + task_rq_unlock(rq, p, &rf); + return -EINVAL; + } +#endif + if (user) { #ifdef CONFIG_RT_GROUP_SCHED /* @@ -10348,6 +10360,35 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock);
+#ifdef CONFIG_QOS_SCHED +static inline int alloc_qos_sched_group(struct task_group *tg, + struct task_group *parent) +{ + tg->qos_level = parent->qos_level; + + return 1; +} + +static void sched_change_qos_group(struct task_struct *tsk, struct task_group *tg) +{ + struct sched_attr attr = {0}; + + /* + * No need to re-setcheduler when a task is exiting or the task + * is in an autogroup. + */ + if (!(tsk->flags & PF_EXITING) && + !task_group_is_autogroup(tg) && + (tg->qos_level == -1)) { + attr.sched_priority = 0; + attr.sched_policy = SCHED_IDLE; + attr.sched_nice = PRIO_TO_NICE(tsk->static_prio); + __setscheduler_params(tsk, &attr); + __setscheduler_prio(tsk, normal_prio(tsk)); + } +} +#endif + static inline void alloc_uclamp_sched_group(struct task_group *tg, struct task_group *parent) { @@ -10398,6 +10439,11 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_fair_sched_group(tg, parent)) goto err;
+#ifdef CONFIG_QOS_SCHED + if (!alloc_qos_sched_group(tg, parent)) + goto err; +#endif + if (!alloc_rt_sched_group(tg, parent)) goto err;
@@ -10484,6 +10530,10 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group { tsk->sched_task_group = group;
+#ifdef CONFIG_QOS_SCHED + sched_change_qos_group(tsk, group); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk); @@ -11212,6 +11262,65 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, } #endif
+#ifdef CONFIG_QOS_SCHED +static int tg_change_scheduler(struct task_group *tg, void *data) +{ + int policy; + struct css_task_iter it; + struct sched_param param; + struct task_struct *tsk; + s64 qos_level = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->qos_level = qos_level; + if (qos_level == -1) { + policy = SCHED_IDLE; + cfs_bandwidth_usage_inc(); + } else { + policy = SCHED_NORMAL; + cfs_bandwidth_usage_dec(); + } + + param.sched_priority = 0; + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_setscheduler(tsk, policy, ¶m); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_qos_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 qos_level) +{ + struct task_group *tg = css_tg(css); + + if (!tg->se[0]) + return -EINVAL; + + if (qos_level != -1 && qos_level != 0) + return -EINVAL; + + if (tg->qos_level == qos_level) + goto done; + + if (tg->qos_level == -1 && qos_level == 0) + return -EINVAL; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level)); + rcu_read_unlock(); +done: + return 0; +} + +static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->qos_level; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -11275,6 +11384,13 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, +#endif +#ifdef CONFIG_QOS_SCHED + { + .name = "qos_level", + .read_s64 = cpu_qos_read, + .write_s64 = cpu_qos_write, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 04846272409c..3586830dbeeb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -403,6 +403,10 @@ struct task_group {
struct cfs_bandwidth cfs_bandwidth;
+#ifdef CONFIG_QOS_SCHED + long qos_level; +#endif + #ifdef CONFIG_UCLAMP_TASK_GROUP /* The two decimal precision [%] value requested from user-space */ unsigned int uclamp_pct[UCLAMP_CNT];
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
In a co-location scenario, online and offline task groups are usually deployed on the same server.

Online tasks are more important than offline tasks. To prevent offline tasks from affecting online tasks, the offline task group is throttled when online task groups are running on the same cpu, and unthrottled when the cpu is about to enter the idle state.
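As a rough, self-contained model of this policy (the names below are invented for illustration and are not the kernel code added by this patch):

#include <stdio.h>

/* Toy model of the per-cpu co-location policy; invented names only. */
struct qos_rq_state {
	int online_nr_running;	/* runnable tasks from online (qos_level == 0) groups */
	int offline_throttled;	/* whether the offline group is throttled on this cpu */
};

static void qos_policy_update(struct qos_rq_state *s)
{
	if (s->online_nr_running > 0)
		s->offline_throttled = 1;	/* online work pending: stop offline tasks */
	else
		s->offline_throttled = 0;	/* cpu about to idle: let offline tasks run */
}

int main(void)
{
	struct qos_rq_state s = { .online_nr_running = 2, .offline_throttled = 0 };

	qos_policy_update(&s);
	printf("offline throttled: %d\n", s.offline_throttled);

	s.online_nr_running = 0;	/* online tasks have gone to sleep */
	qos_policy_update(&s);
	printf("offline throttled: %d\n", s.offline_throttled);
	return 0;
}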
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 kernel/sched/fair.c | 145 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1795f6fe991f..155f815b467a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -124,6 +124,10 @@ int __weak arch_asym_cpu_priority(int cpu) #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) #endif
+#ifdef CONFIG_QOS_SCHED +static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -8115,6 +8119,124 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ resched_curr(rq); }
+#ifdef CONFIG_QOS_SCHED +static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *se; + long task_delta, idle_task_delta, dequeue = 1; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + /* freeze hierarchy runnable averages while throttled */ + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); + + task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + break; + + if (dequeue) + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + + if (qcfs_rq->load.weight) + dequeue = 0; + } + + if (!se) + sub_nr_running(rq, task_delta); + + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + + list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); +} + +static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + int enqueue = 1; + long task_delta, idle_task_delta; + + se = cfs_rq->tg->se[cpu_of(rq)]; + + cfs_rq->throttled = 0; + + update_rq_clock(rq); + + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + list_del_init(&cfs_rq->throttled_list); + + /* update hierarchical throttle state */ + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + + if (!cfs_rq->load.weight) + return; + + task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + enqueue = 0; + + cfs_rq = cfs_rq_of(se); + if (enqueue) + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + if (cfs_rq_throttled(cfs_rq)) + break; + } + + assert_list_leaf_cfs_rq(rq); + + if (!se) + add_nr_running(rq, task_delta); + + /* Determine whether we need to wake up potentially idle CPU: */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_curr(rq); +} + +static int unthrottle_qos_cfs_rqs(int cpu) +{ + struct cfs_rq *cfs_rq, *tmp_rq; + int res = 0; + + list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), + throttled_list) { + if (cfs_rq_throttled(cfs_rq)) { + unthrottle_qos_cfs_rq(cfs_rq); + res++; + } + } + + return res; +} + +static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && + !sched_idle_cpu(smp_processor_id()) && + cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { + throttle_qos_cfs_rq(cfs_rq); + return true; + } + + return false; +} +#endif + #ifdef CONFIG_SMP static struct task_struct *pick_task_fair(struct rq *rq) { @@ -8205,6 +8327,16 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
se = pick_next_entity(cfs_rq, curr); cfs_rq = group_cfs_rq(se); +#ifdef CONFIG_QOS_SCHED + if (check_qos_cfs_rq(cfs_rq)) { + cfs_rq = &rq->cfs; + WARN(cfs_rq->nr_running == 0, + "rq->nr_running=%u, cfs_rq->idle_h_nr_running=%u\n", + rq->nr_running, cfs_rq->idle_h_nr_running); + if (unlikely(!cfs_rq->nr_running)) + return NULL; + } +#endif } while (cfs_rq);
p = task_of(se); @@ -8284,6 +8416,12 @@ done: __maybe_unused; if (new_tasks > 0) goto again;
+#ifdef CONFIG_QOS_SCHED + if (unthrottle_qos_cfs_rqs(cpu_of(rq))) { + rq->idle_stamp = 0; + goto again; + } +#endif /* * rq is about to be idle, check if we need to update the * lost_idle_time of clock_pelt @@ -12974,6 +13112,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void) { +#ifdef CONFIG_QOS_SCHED + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); +#endif + #ifdef CONFIG_SMP int i;
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
If online tasks occupy 100% of the CPU, offline tasks cannot be scheduled because they are throttled; as a result, an offline task cannot respond in time after receiving a SIGKILL signal. Fix this by moving a SIGKILLed offline task back to the root cpu cgroup and resetting its policy to SCHED_NORMAL, so that it is no longer subject to qos throttling and can exit promptly.
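A minimal user-space sketch of the symptom, assuming an offline group whose cpu is saturated by online tasks; move_to_offline_group() is a hypothetical helper (for example, writing the child's PID into the group's cgroup.procs file):

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

/* Hypothetical: e.g. echo <pid> > /sys/fs/cgroup/cpu/offline/cgroup.procs */
static void move_to_offline_group(pid_t pid)
{
	(void)pid;
}

int main(void)
{
	pid_t pid = fork();
	time_t start;

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0)
		for (;;)
			;	/* offline busy loop */

	move_to_offline_group(pid);
	sleep(1);

	start = time(NULL);
	kill(pid, SIGKILL);
	waitpid(pid, NULL, 0);
	printf("offline task exited %ld s after SIGKILL\n",
	       (long)(time(NULL) - start));
	return 0;
}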
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 include/linux/cgroup.h |  4 ++++
 include/linux/sched.h  | 16 ++++++++++++++++
 kernel/cgroup/cgroup.c | 22 ++++++++++++++++++++++
 kernel/sched/core.c    | 32 ++++++++++++++++++++++++++++++++
 kernel/signal.c        |  3 +++
 5 files changed, 77 insertions(+)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b307013b9c6c..7fa51b600ee8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -855,4 +855,8 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
#endif /* CONFIG_CGROUP_BPF */
+#ifdef CONFIG_QOS_SCHED +void cgroup_move_task_to_root(struct task_struct *tsk); +#endif + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 77f01ac385f7..6b69f92c66fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2457,4 +2457,20 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask); +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask); +void sched_prefer_cpus_free(struct task_struct *p); + +extern struct static_key_false __dynamic_affinity_switch; +static inline bool dynamic_affinity_enabled(void) +{ + return static_branch_unlikely(&__dynamic_affinity_switch); +} +#endif +#ifdef CONFIG_QOS_SCHED +void sched_move_offline_task(struct task_struct *p); +#endif + #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1fb7f562289d..e7e91c8207d7 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2936,6 +2936,28 @@ void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked ss->post_attach(); }
+#ifdef CONFIG_QOS_SCHED +void cgroup_move_task_to_root(struct task_struct *tsk) +{ + struct css_set *css; + struct cgroup *cpu_cgrp; + struct cgroup *cpu_root_cgrp; + + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + + spin_lock_irq(&css_set_lock); + css = task_css_set(tsk); + cpu_cgrp = css->subsys[cpu_cgrp_id]->cgroup; + cpu_root_cgrp = &cpu_cgrp->root->cgrp; + spin_unlock_irq(&css_set_lock); + + (void)cgroup_attach_task(cpu_root_cgrp, tsk, false); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); +} +#endif + static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9f466f43c023..8493c0ddb73f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10387,6 +10387,38 @@ static void sched_change_qos_group(struct task_struct *tsk, struct task_group *t __setscheduler_prio(tsk, normal_prio(tsk)); } } + +struct offline_args { + struct work_struct work; + struct task_struct *p; +}; + +static void sched_move_work(struct work_struct *work) +{ + struct sched_param param = { .sched_priority = 0 }; + struct offline_args *args = container_of(work, struct offline_args, work); + + cgroup_move_task_to_root(args->p); + sched_setscheduler(args->p, SCHED_NORMAL, ¶m); + put_task_struct(args->p); + kfree(args); +} + +void sched_move_offline_task(struct task_struct *p) +{ + struct offline_args *args; + + if (unlikely(task_group(p)->qos_level != -1)) + return; + + args = kmalloc(sizeof(struct offline_args), GFP_ATOMIC); + if (args) { + get_task_struct(p); + args->p = p; + INIT_WORK(&args->work, sched_move_work); + queue_work(system_highpri_wq, &args->work); + } +} #endif
static inline void alloc_uclamp_sched_group(struct task_group *tg, diff --git a/kernel/signal.c b/kernel/signal.c index 09019017d669..28cddef39778 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1060,6 +1060,9 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) signal->group_stop_count = 0; t = p; do { +#ifdef CONFIG_QOS_SCHED + sched_move_offline_task(t); +#endif task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1);
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
In the cpu hotplug case, when a cpu goes offline, unthrottle the cfs_rqs that were throttled by qos on that cpu so that their tasks can be migrated to online cpus.
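For reference, a small sketch that cycles a cpu through offline and back online to exercise this path; it needs root, and the cpu number is an assumption (cpu0 is often not hotpluggable).

#include <stdio.h>

static int set_cpu_online(int cpu, int online)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", cpu);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", online);
	return fclose(f);
}

int main(void)
{
	if (set_cpu_online(1, 0))
		perror("offline cpu1");
	if (set_cpu_online(1, 1))
		perror("online cpu1");
	return 0;
}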
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 kernel/sched/fair.c | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 155f815b467a..88423df5a2b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -126,6 +126,7 @@ int __weak arch_asym_cpu_priority(int cpu)
#ifdef CONFIG_QOS_SCHED static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +static int unthrottle_qos_cfs_rqs(int cpu); #endif
#ifdef CONFIG_CFS_BANDWIDTH @@ -6309,6 +6310,9 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) rcu_read_unlock();
rq_clock_stop_loop_update(rq); +#ifdef CONFIG_QOS_SCHED + unthrottle_qos_cfs_rqs(cpu_of(rq)); +#endif }
bool cfs_task_bw_constrained(struct task_struct *p)
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
When a task group is freed, its cfs_rqs are freed even if they are still throttled, which leads to a use-after-free bug. Therefore, before freeing a task group, unthrottle all cfs_rqs belonging to it.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 kernel/sched/fair.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 88423df5a2b8..c5cdb03be234 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8239,6 +8239,17 @@ static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
return false; } + +static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (cfs_rq->tg->qos_level == -1 && cfs_rq_throttled(cfs_rq)) + unthrottle_qos_cfs_rq(cfs_rq); + rq_unlock_irqrestore(rq, &rf); +} #endif
#ifdef CONFIG_SMP @@ -12742,6 +12753,10 @@ void free_fair_sched_group(struct task_group *tg) int i;
for_each_possible_cpu(i) { +#ifdef CONFIG_QOS_SCHED + if (tg->cfs_rq) + unthrottle_qos_sched_group(tg->cfs_rq[i]); +#endif if (tg->cfs_rq) kfree(tg->cfs_rq[i]); if (tg->se)
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 kernel/sched/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8493c0ddb73f..4399317d88ad 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7737,8 +7737,8 @@ static int __sched_setscheduler(struct task_struct *p, * isolation will be invalid, so return -EINVAL in this case. */ if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) { - task_rq_unlock(rq, p, &rf); - return -EINVAL; + retval = -EINVAL; + goto unlock; } #endif
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
When online tasks occupy the cpu for a long time, offline tasks get no cpu time at all, which can trigger priority inversion (for example, a throttled offline task may hold a resource that an online task is waiting for). To handle this, when online tasks have occupied the cpu for more than 5s (the default value), we unthrottle the offline tasks so they get a chance to run, and make them wait in a sleep loop before returning to user mode until the cpu goes idle.
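For illustration, the two tunables added here can be adjusted from user space as sketched below. The /proc/sys/kernel/ paths follow from the entries being added to kern_table, but should be treated as an assumption.

#include <stdio.h>

int main(void)
{
	unsigned int period_ms = 0;
	FILE *f = fopen("/proc/sys/kernel/qos_overload_detect_period_ms", "r+");

	if (!f) {
		perror("qos_overload_detect_period_ms");
		return 1;
	}
	if (fscanf(f, "%u", &period_ms) == 1)
		printf("overload detect period: %u ms\n", period_ms);
	fclose(f);

	/* lower the wait interval to its documented minimum (100 ms) */
	f = fopen("/proc/sys/kernel/qos_offline_wait_interval_ms", "w");
	if (f) {
		fprintf(f, "100\n");
		fclose(f);
	}
	return 0;
}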
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 include/linux/sched.h        |  7 +++
 include/linux/sched/sysctl.h |  8 ++++
 kernel/entry/common.c        |  7 ++-
 kernel/sched/core.c          |  3 ++
 kernel/sched/fair.c          | 84 ++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h         |  4 ++
 kernel/sysctl.c              | 24 +++++++++++
 7 files changed, 133 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6b69f92c66fa..6427e3bbb647 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2471,6 +2471,13 @@ static inline bool dynamic_affinity_enabled(void) #endif #ifdef CONFIG_QOS_SCHED void sched_move_offline_task(struct task_struct *p); +void sched_qos_offline_wait(void); +int sched_qos_cpu_overload(void); +#else +static inline int sched_qos_cpu_overload(void) +{ + return 0; +} #endif
#endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 5a64582b086b..28d9be8e4614 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -29,4 +29,12 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +extern int sysctl_sched_util_low_pct; +#endif +#ifdef CONFIG_QOS_SCHED +extern unsigned int sysctl_overload_detect_period; +extern unsigned int sysctl_offline_wait_interval; +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index d7ee4bc3f2ba..43698b7dd2ec 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -170,6 +170,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(regs);
+#ifdef CONFIG_QOS_SCHED + sched_qos_offline_wait(); +#endif + /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work);
@@ -200,7 +204,8 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) tick_nohz_user_enter_prepare();
ti_work = read_thread_flags(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) || + sched_qos_cpu_overload())) ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4399317d88ad..fb2249b4f571 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10021,6 +10021,9 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ +#ifdef CONFIG_QOS_SCHED + init_qos_hrtimer(i); +#endif init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c5cdb03be234..fabd201adbb2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -57,6 +57,10 @@ #include "stats.h" #include "autogroup.h"
+#ifdef CONFIG_QOS_SCHED +#include <linux/delay.h> +#endif + /* * The initial- and re-scaling of tunables is configurable * @@ -126,6 +130,10 @@ int __weak arch_asym_cpu_priority(int cpu)
#ifdef CONFIG_QOS_SCHED static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer); +static DEFINE_PER_CPU(int, qos_cpu_overload); +unsigned int sysctl_overload_detect_period = 5000; /* in ms */ +unsigned int sysctl_offline_wait_interval = 100; /* in ms */ static int unthrottle_qos_cfs_rqs(int cpu); #endif
@@ -8124,6 +8132,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ }
#ifdef CONFIG_QOS_SCHED +static void start_qos_hrtimer(int cpu); static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -8157,6 +8166,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) if (!se) sub_nr_running(rq, task_delta);
+ if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) + start_qos_hrtimer(cpu_of(rq)); + cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq);
@@ -8212,7 +8224,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); }
-static int unthrottle_qos_cfs_rqs(int cpu) +static int __unthrottle_qos_cfs_rqs(int cpu) { struct cfs_rq *cfs_rq, *tmp_rq; int res = 0; @@ -8228,11 +8240,25 @@ static int unthrottle_qos_cfs_rqs(int cpu) return res; }
+static int unthrottle_qos_cfs_rqs(int cpu) +{ + int res; + + res = __unthrottle_qos_cfs_rqs(cpu); + if (res) + hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu))); + + return res; +} + static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) { + if (unlikely(__this_cpu_read(qos_cpu_overload))) + return false; + if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && - !sched_idle_cpu(smp_processor_id()) && - cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { + !sched_idle_cpu(smp_processor_id()) && + cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { throttle_qos_cfs_rq(cfs_rq); return true; } @@ -8250,6 +8276,56 @@ static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq) unthrottle_qos_cfs_rq(cfs_rq); rq_unlock_irqrestore(rq, &rf); } + +void sched_qos_offline_wait(void) +{ + long qos_level; + + while (unlikely(this_cpu_read(qos_cpu_overload))) { + rcu_read_lock(); + qos_level = task_group(current)->qos_level; + rcu_read_unlock(); + if (qos_level != -1 || signal_pending(current)) + break; + msleep_interruptible(sysctl_offline_wait_interval); + } +} + +int sched_qos_cpu_overload(void) +{ + return __this_cpu_read(qos_cpu_overload); +} + +static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer) +{ + struct rq_flags rf; + struct rq *rq = this_rq(); + + rq_lock_irqsave(rq, &rf); + if (__unthrottle_qos_cfs_rqs(smp_processor_id())) + __this_cpu_write(qos_cpu_overload, 1); + rq_unlock_irqrestore(rq, &rf); + + return HRTIMER_NORESTART; +} + +static void start_qos_hrtimer(int cpu) +{ + ktime_t time; + struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu)); + + time = ktime_add_ms(hrtimer->base->get_time(), (u64)sysctl_overload_detect_period); + hrtimer_set_expires(hrtimer, time); + hrtimer_start_expires(hrtimer, HRTIMER_MODE_ABS_PINNED); +} + +void init_qos_hrtimer(int cpu) +{ + struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu)); + + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer->function = qos_overload_timer_handler; +} #endif
#ifdef CONFIG_SMP @@ -8436,6 +8512,8 @@ done: __maybe_unused; rq->idle_stamp = 0; goto again; } + + __this_cpu_write(qos_cpu_overload, 0); #endif /* * rq is about to be idle, check if we need to update the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3586830dbeeb..058024a31d79 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1407,6 +1407,10 @@ do { \ flags = _raw_spin_rq_lock_irqsave(rq); \ } while (0)
+#ifdef CONFIG_QOS_SCHED +void init_qos_hrtimer(int cpu); +#endif + #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 354a2d294f52..4d100e8e9046 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,6 +96,10 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); static const int six_hundred_forty_kb = 640 * 1024; #endif
+#ifdef CONFIG_QOS_SCHED +static int one_thousand = 1000; +static int hundred_thousand = 100000; +#endif
static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2033,6 +2037,26 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#ifdef CONFIG_QOS_SCHED + { + .procname = "qos_overload_detect_period_ms", + .data = &sysctl_overload_detect_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_HUNDRED, + .extra2 = &hundred_thousand, + }, + { + .procname = "qos_offline_wait_interval_ms", + .data = &sysctl_offline_wait_interval, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_HUNDRED, + .extra2 = &one_thousand, + }, +#endif { .procname = "max_rcu_stall_to_panic", .data = &sysctl_max_rcu_stall_to_panic,
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 arch/arm64/configs/openeuler_defconfig | 1 +
 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 8f1a4db8d49b..9409921527bd 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -176,6 +176,7 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y # CONFIG_CGROUP_MISC is not set +CONFIG_QOS_SCHED=y # CONFIG_CGROUP_DEBUG is not set CONFIG_SOCK_CGROUP_DATA=y CONFIG_NAMESPACES=y
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
cfs_bandwidth_usage_inc() needs to hold jump_label_mutex and might sleep, so it must not be called in atomic context. Fix this by moving cfs_bandwidth_usage_{inc,dec}() out of the rcu read-side critical section.
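As a generic illustration of the bug class (an illustrative kernel-style sketch, not this patch's code): a sleeping lock, such as the jump_label mutex taken inside cfs_bandwidth_usage_inc(), must not be acquired inside an RCU read-side critical section; it has to be taken before (or after) the rcu_read_lock()/rcu_read_unlock() pair.

#include <linux/mutex.h>
#include <linux/rcupdate.h>

static DEFINE_MUTEX(example_mutex);	/* stands in for jump_label_mutex */

static void broken_pattern(void)
{
	rcu_read_lock();
	mutex_lock(&example_mutex);	/* BUG: may sleep inside an RCU read section */
	mutex_unlock(&example_mutex);
	rcu_read_unlock();
}

static void fixed_pattern(void)
{
	mutex_lock(&example_mutex);	/* take the sleeping lock first... */
	mutex_unlock(&example_mutex);

	rcu_read_lock();		/* ...then do the RCU-protected walk */
	rcu_read_unlock();
}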
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 kernel/sched/core.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb2249b4f571..13165ccda9ff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11308,13 +11308,10 @@ static int tg_change_scheduler(struct task_group *tg, void *data) struct cgroup_subsys_state *css = &tg->css;
tg->qos_level = qos_level; - if (qos_level == -1) { + if (qos_level == -1) policy = SCHED_IDLE; - cfs_bandwidth_usage_inc(); - } else { + else policy = SCHED_NORMAL; - cfs_bandwidth_usage_dec(); - }
param.sched_priority = 0; css_task_iter_start(css, 0, &it); @@ -11342,6 +11339,13 @@ static int cpu_qos_write(struct cgroup_subsys_state *css, if (tg->qos_level == -1 && qos_level == 0) return -EINVAL;
+ cpus_read_lock(); + if (qos_level == -1) + cfs_bandwidth_usage_inc(); + else + cfs_bandwidth_usage_dec(); + cpus_read_unlock(); + rcu_read_lock(); walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level)); rcu_read_unlock();
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
-----------------------------------------------------------------
When a cfs_rq is unthrottled in distribute_cfs_runtime(), another cpu may re-throttle it in throttle_qos_cfs_rq() before cfs_rq->throttled_list.next is accessed. Meanwhile, the qos throttle attaches the cfs_rq's throttled_list node to the per-cpu qos_throttled_cfs_rq list, which changes cfs_rq->throttled_list.next and can cause a panic or hard lockup in distribute_cfs_runtime().

Fix it by adding a separate qos_throttled_list node to struct cfs_rq, so that the qos throttle no longer uses cfs_rq->throttled_list.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 kernel/sched/fair.c  | 10 +++++++---
 kernel/sched/sched.h |  4 ++++
 2 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fabd201adbb2..20478a3285af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6204,6 +6204,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); #endif +#ifdef CONFIG_QOS_SCHED + INIT_LIST_HEAD(&cfs_rq->qos_throttled_list); +#endif }
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -8172,7 +8175,8 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq);
- list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); + list_add(&cfs_rq->qos_throttled_list, + &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); }
static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) @@ -8190,7 +8194,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq);
cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; - list_del_init(&cfs_rq->throttled_list); + list_del_init(&cfs_rq->qos_throttled_list);
/* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -8230,7 +8234,7 @@ static int __unthrottle_qos_cfs_rqs(int cpu) int res = 0;
list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), - throttled_list) { + qos_throttled_list) { if (cfs_rq_throttled(cfs_rq)) { unthrottle_qos_cfs_rq(cfs_rq); res++; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 058024a31d79..3de84e95baf1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -653,6 +653,10 @@ struct cfs_rq { #endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ + +#if defined(CONFIG_QOS_SCHED) + struct list_head qos_throttled_list; +#endif };
static inline int rt_bandwidth_enabled(void)
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
Previously, when the cpu was detected to be overloaded, offline tasks were throttled in exit_to_user_mode_loop() before returning to user mode. Some architectures (e.g. arm64) could not support the QOS scheduler because on those platforms tasks do not return to user space via exit_to_user_mode_loop(). To solve this problem and support the qos scheduler on all architectures, when offline tasks need to be throttled, set the TIF_NOTIFY_RESUME flag on an offline task when it is picked and throttle it in resume_user_mode_work() (formerly tracehook_notify_resume()).
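Schematically, the new flow looks like the toy model below (the names are invented; the real code uses TIF_NOTIFY_RESUME, set_notify_resume() and sched_qos_offline_wait()). The point is that every architecture already processes TIF_NOTIFY_RESUME on its way back to user space, so the throttling no longer depends on the generic entry code.

#include <stdio.h>

struct model_task {
	int offline;		/* picked from a qos_level == -1 group */
	int notify_resume;	/* models TIF_NOTIFY_RESUME */
};

static int cpu_overloaded = 1;	/* models this cpu's qos_cpu_overload flag */

/* at pick time: flag offline tasks on an overloaded cpu */
static void model_pick(struct model_task *t)
{
	if (cpu_overloaded && t->offline)
		t->notify_resume = 1;	/* models set_notify_resume() */
}

/* on return to user mode: flagged tasks wait until the overload clears */
static void model_resume_user_mode_work(struct model_task *t)
{
	if (!t->notify_resume)
		return;
	t->notify_resume = 0;
	while (t->offline && cpu_overloaded) {
		/* models sched_qos_offline_wait(): sleep in short intervals */
		printf("offline task waiting for the cpu to go idle...\n");
		cpu_overloaded = 0;	/* in reality cleared when the cpu idles */
	}
}

int main(void)
{
	struct model_task t = { .offline = 1, .notify_resume = 0 };

	model_pick(&t);
	model_resume_user_mode_work(&t);
	return 0;
}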
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 include/linux/resume_user_mode.h |  5 +++++
 kernel/entry/common.c            |  7 +------
 kernel/sched/fair.c              | 33 ++++++++++++++++++++++++++++++--
 3 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index f8f3e958e9cf..255372856812 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,6 +59,11 @@ static inline void resume_user_mode_work(struct pt_regs *regs) blkcg_maybe_throttle_current();
rseq_handle_notify_resume(NULL, regs); + +#ifdef CONFIG_QOS_SCHED + sched_qos_offline_wait(); +#endif + }
#endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 43698b7dd2ec..d7ee4bc3f2ba 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -170,10 +170,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(regs);
-#ifdef CONFIG_QOS_SCHED - sched_qos_offline_wait(); -#endif - /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work);
@@ -204,8 +200,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) tick_nohz_user_enter_prepare();
ti_work = read_thread_flags(); - if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) || - sched_qos_cpu_overload())) + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 20478a3285af..59bfa1c59c82 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -59,6 +59,7 @@
#ifdef CONFIG_QOS_SCHED #include <linux/delay.h> +#include <linux/resume_user_mode.h> #endif
/* @@ -8135,6 +8136,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ }
#ifdef CONFIG_QOS_SCHED + +static inline bool is_offline_task(struct task_struct *p) +{ + return task_group(p)->qos_level == QOS_LEVEL_OFFLINE; +} + static void start_qos_hrtimer(int cpu); static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { @@ -8289,9 +8296,10 @@ void sched_qos_offline_wait(void) rcu_read_lock(); qos_level = task_group(current)->qos_level; rcu_read_unlock(); - if (qos_level != -1 || signal_pending(current)) + if (qos_level != -1 || fatal_signal_pending(current)) break; - msleep_interruptible(sysctl_offline_wait_interval); + + schedule_timeout_killable(msecs_to_jiffies(sysctl_offline_wait_interval)); } }
@@ -8330,6 +8338,23 @@ void init_qos_hrtimer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); hrtimer->function = qos_overload_timer_handler; } + +/* + * To avoid Priority inversion issues, when this cpu is qos_cpu_overload, + * we should schedule offline tasks to run so that they can leave kernel + * critical sections, and throttle them before returning to user mode. + */ +static void qos_schedule_throttle(struct task_struct *p) +{ + if (unlikely(current->flags & PF_KTHREAD)) + return; + + if (unlikely(this_cpu_read(qos_cpu_overload))) { + if (is_offline_task(p)) + set_notify_resume(p); + } +} + #endif
#ifdef CONFIG_SMP @@ -8492,6 +8517,10 @@ done: __maybe_unused; update_misfit_status(p, rq); sched_fair_update_stop_tick(rq, p);
+#ifdef CONFIG_QOS_SCHED + qos_schedule_throttle(p); +#endif + return p;
idle:
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
1. The qos throttle path reuses tg_{throttle,unthrottle}_{up,down}(), which also write cfs-bandwidth fields, and that may corrupt the bandwidth accounting. Add dedicated qos_tg_{throttle,unthrottle}_{up,down}() helpers for the qos throttle path.

2. Callers of walk_tg_tree_from() must hold the rcu read lock; the qos unthrottle path currently does not, so add it.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 kernel/sched/fair.c | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 59bfa1c59c82..8bad1fb18e92 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6297,6 +6297,9 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * the rq clock again in unthrottle_cfs_rq(). */ rq_clock_start_loop_update(rq); +#ifdef CONFIG_QOS_SCHED + unthrottle_qos_cfs_rqs(cpu_of(rq)); +#endif
rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -8143,6 +8146,27 @@ static inline bool is_offline_task(struct task_struct *p) }
static void start_qos_hrtimer(int cpu); + +static int qos_tg_unthrottle_up(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count--; + + return 0; +} + +static int qos_tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count++; + + return 0; +} + static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -8153,7 +8177,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
/* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, qos_tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock();
task_delta = cfs_rq->h_nr_running; @@ -8180,7 +8204,6 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) start_qos_hrtimer(cpu_of(rq));
cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq);
list_add(&cfs_rq->qos_throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); @@ -8189,7 +8212,6 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; int enqueue = 1; long task_delta, idle_task_delta; @@ -8198,13 +8220,12 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
- update_rq_clock(rq); - - cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; list_del_init(&cfs_rq->qos_throttled_list);
/* update hierarchical throttle state */ - walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_nop, qos_tg_unthrottle_up, (void *)rq); + rcu_read_unlock();
if (!cfs_rq->load.weight) return;
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
------------[ cut here ]------------
rq->clock_update_flags < RQCF_ACT_SKIP
WARNING: CPU: 5 PID: 3312 at kernel/sched/sched.h:1223 update_curr+0x1e5/0x210
CPU: 5 PID: 3312 Comm: a.out Tainted: G S  5.10.0.zq+ #1
Hardware name: Huawei RH2288H V3/BC11HGSA0, BIOS 3.35 10/20/2016
RIP: 0010:update_curr+0x1e5/0x210
 enqueue_entity+0x378/0xd00
 unthrottle_qos_cfs_rq+0x1bc/0x2a0
 __unthrottle_qos_cfs_rqs+0x87/0xa0
 qos_overload_timer_handler+0x35/0x60
 __run_hrtimer+0x5e/0x190
 __hrtimer_run_queues+0x81/0xe0
 hrtimer_interrupt+0x110/0x2c0
 __sysvec_apic_timer_interrupt+0x5f/0xd0
 sysvec_apic_timer_interrupt+0x31/0x80
 asm_sysvec_apic_timer_interrupt+0x12/0x20
After the last rq_pin_lock(), there is no rq clock update before enqueue_entity() is called in unthrottle_qos_cfs_rq().

Fix this by updating the rq clock before calling enqueue_entity().
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 kernel/sched/fair.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8bad1fb18e92..20e44e802f68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8220,6 +8220,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
+ update_rq_clock(rq); list_del_init(&cfs_rq->qos_throttled_list);
/* update hierarchical throttle state */
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
-------------------------------
When a cfs_rq was throttled by qos, cfs_rq->throttled was set to 1, so the cfs bandwidth code could unthrottle that cfs_rq by mistake, causing a list_del validation warning. Add the macro QOS_THROTTLED (= 2): when a cfs_rq is throttled by qos, mark cfs_rq->throttled as QOS_THROTTLED, and check the value of cfs_rq->throttled before unthrottling a cfs_rq.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 kernel/sched/fair.c | 139 ++++++++++++++++++++++++++++----------------
 1 file changed, 90 insertions(+), 49 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 20e44e802f68..146880588aab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -130,6 +130,13 @@ int __weak arch_asym_cpu_priority(int cpu) #endif
#ifdef CONFIG_QOS_SCHED + +/* + * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled + * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1). + */ +#define QOS_THROTTLED 2 + static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer); static DEFINE_PER_CPU(int, qos_cpu_overload); @@ -5653,6 +5660,14 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq)];
+#ifdef CONFIG_QOS_SCHED + /* + * if this cfs_rq throttled by qos, not need unthrottle it. + */ + if (cfs_rq->throttled == QOS_THROTTLED) + return; +#endif + cfs_rq->throttled = 0;
update_rq_clock(rq); @@ -8142,42 +8157,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
static inline bool is_offline_task(struct task_struct *p) { - return task_group(p)->qos_level == QOS_LEVEL_OFFLINE; + return task_group(p)->qos_level == -1; }
static void start_qos_hrtimer(int cpu);
-static int qos_tg_unthrottle_up(struct task_group *tg, void *data) -{ - struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - - cfs_rq->throttle_count--; - - return 0; -} - -static int qos_tg_throttle_down(struct task_group *tg, void *data) -{ - struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - - cfs_rq->throttle_count++; - - return 0; -} - static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; - long task_delta, idle_task_delta, dequeue = 1; + long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
/* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, qos_tg_throttle_down, tg_nop, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock();
task_delta = cfs_rq->h_nr_running; @@ -8186,24 +8181,44 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) - break; + goto done; + + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
- if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta;
- if (qcfs_rq->load.weight) - dequeue = 0; + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; + } }
- if (!se) - sub_nr_running(rq, task_delta); + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + } + + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta);
+done: if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) start_qos_hrtimer(cpu_of(rq));
- cfs_rq->throttled = 1; + cfs_rq->throttled = QOS_THROTTLED;
list_add(&cfs_rq->qos_throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); @@ -8213,11 +8228,13 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; - int enqueue = 1; long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
+ if (cfs_rq->throttled != QOS_THROTTLED) + return; + cfs_rq->throttled = 0;
update_rq_clock(rq); @@ -8225,32 +8242,58 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_nop, qos_tg_unthrottle_up, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); rcu_read_unlock();
- if (!cfs_rq->load.weight) - return; + if (!cfs_rq->load.weight) { + if (!cfs_rq->on_list) + return; + /* + * Nothing to run but something to decay (on_list)? + * Complete the branch. + */ + for_each_sched_entity(se) { + if (list_add_leaf_cfs_rq(cfs_rq_of(se))) + break; + } + goto unthrottle_throttle; + }
task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) - enqueue = 0; + break;
cfs_rq = cfs_rq_of(se); - if (enqueue) - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta;
if (cfs_rq_throttled(cfs_rq)) - break; + goto unthrottle_throttle; }
- assert_list_leaf_cfs_rq(rq); + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se);
- if (!se) - add_nr_running(rq, task_delta); + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + } + + add_nr_running(rq, task_delta); + +unthrottle_throttle: + + assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -13264,13 +13307,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void) { -#ifdef CONFIG_QOS_SCHED - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); -#endif - #ifdef CONFIG_SMP int i;
@@ -13286,6 +13322,11 @@ __init void init_sched_fair_class(void) #endif }
+#ifdef CONFIG_QOS_SCHED + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); +#endif + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
Enable CONFIG_QOS_SCHED to support the qos scheduler.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com
---
 arch/x86/configs/openeuler_defconfig | 1 +
 1 file changed, 1 insertion(+)
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 507d199ff598..6b6cde2bbc02 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -185,6 +185,7 @@ CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y +CONFIG_QOS_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y CONFIG_CGROUP_PIDS=y
From: Xia Fukun xiafukun@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZMCB
CVE: NA
--------------------------------
CPU hotplug callbacks race against distribute_cfs_runtime(): when the QOS_SCHED feature is enabled, there may be situations where cfs_rq->runtime_remaining == 1 while the cfs_rq is QOS_THROTTLED.

Do not qos-throttle a cfs_rq whose CPU is offline, and no longer distribute runtime to a qos-throttled cfs_rq in this scenario, which fixes the warning.
Fixes: 4eb6eb7941dc ("sched/qos: Don't unthrottle cfs_rq when cfs_rq is throttled by qos")
Signed-off-by: Xia Fukun xiafukun@huawei.com
---
 kernel/sched/fair.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 146880588aab..500a666e29b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5852,7 +5852,20 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) goto next; #endif
- /* By the above checks, this should never be true */ + /* + * CPU hotplug callbacks race against distribute_cfs_runtime() + * when the QOS_SCHED feature is enabled, there may be + * situations where the runtime_remaining > 0. + * Qos_sched does not care whether the cfs_rq has time left, + * so no longer allocate time to cfs_rq in this scenario. + */ +#ifdef CONFIG_QOS_SCHED + if (cfs_rq->throttled == QOS_THROTTLED && + cfs_rq->runtime_remaining > 0) + goto next; +#endif + + /* By the above check, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock); @@ -8335,6 +8348,10 @@ static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && !sched_idle_cpu(smp_processor_id()) && cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { + + if (!rq_of(cfs_rq)->online) + return false; + throttle_qos_cfs_rq(cfs_rq); return true; }
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/3025 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/I...