From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
In cloud-native hybrid deployment scenarios, online tasks must preempt offline tasks in a timely and offline tasks can't affect the QoS of online tasks, so we introduce the idea of qos level to scheduler, which now is supported with different scheduler policies. The qos scheduler will change the policy of correlative tasks when the qos level of a task group is modified with cpu.qos_level cpu cgroup file. In this way we are able to satisfy different needs of tasks in different qos levels.
The value of qos_level can be 0 or -1, default value is 0. If qos_level is 0, the group is an online group. otherwise it is an offline group.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- init/Kconfig | 12 +++++ kernel/sched/core.c | 116 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 4 ++ 3 files changed, 132 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig index 7a3299a632e0..d96c76143610 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -984,6 +984,18 @@ menuconfig CGROUP_SCHED tasks.
if CGROUP_SCHED +config QOS_SCHED + bool "Qos task scheduling" + depends on CGROUP_SCHED + depends on CFS_BANDWIDTH + default n + help + This option enable qos scheduler, and support co-location online + services (Latency Sensitive) and offline tasks. colocation can + effectively improve the resource utilization. + + If in doubt, say N. + config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89ac3c89e39e..ac7abb6c5c64 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7680,6 +7680,18 @@ static int __sched_setscheduler(struct task_struct *p, } change:
+#ifdef CONFIG_QOS_SCHED + /* + * If the scheduling policy of an offline task is set to a policy + * other than SCHED_IDLE, the online task preemption and cpu resource + * isolation will be invalid, so return -EINVAL in this case. + */ + if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) { + task_rq_unlock(rq, p, &rf); + return -EINVAL; + } +#endif + if (user) { #ifdef CONFIG_RT_GROUP_SCHED /* @@ -10308,6 +10320,35 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock);
+#ifdef CONFIG_QOS_SCHED +static inline int alloc_qos_sched_group(struct task_group *tg, + struct task_group *parent) +{ + tg->qos_level = parent->qos_level; + + return 1; +} + +static void sched_change_qos_group(struct task_struct *tsk, struct task_group *tg) +{ + struct sched_attr attr = {0}; + + /* + * No need to re-setcheduler when a task is exiting or the task + * is in an autogroup. + */ + if (!(tsk->flags & PF_EXITING) && + !task_group_is_autogroup(tg) && + (tg->qos_level == -1)) { + attr.sched_priority = 0; + attr.sched_policy = SCHED_IDLE; + attr.sched_nice = PRIO_TO_NICE(tsk->static_prio); + __setscheduler_params(tsk, &attr); + __setscheduler_prio(tsk, normal_prio(tsk)); + } +} +#endif + static inline void alloc_uclamp_sched_group(struct task_group *tg, struct task_group *parent) { @@ -10358,6 +10399,11 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_fair_sched_group(tg, parent)) goto err;
+#ifdef CONFIG_QOS_SCHED + if (!alloc_qos_sched_group(tg, parent)) + goto err; +#endif + if (!alloc_rt_sched_group(tg, parent)) goto err;
@@ -10444,6 +10490,10 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group { tsk->sched_task_group = group;
+#ifdef CONFIG_QOS_SCHED + sched_change_qos_group(tsk, group); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk); @@ -11146,6 +11196,65 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, } #endif
+#ifdef CONFIG_QOS_SCHED +static int tg_change_scheduler(struct task_group *tg, void *data) +{ + int policy; + struct css_task_iter it; + struct sched_param param; + struct task_struct *tsk; + s64 qos_level = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->qos_level = qos_level; + if (qos_level == -1) { + policy = SCHED_IDLE; + cfs_bandwidth_usage_inc(); + } else { + policy = SCHED_NORMAL; + cfs_bandwidth_usage_dec(); + } + + param.sched_priority = 0; + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_setscheduler(tsk, policy, ¶m); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_qos_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 qos_level) +{ + struct task_group *tg = css_tg(css); + + if (!tg->se[0]) + return -EINVAL; + + if (qos_level != -1 && qos_level != 0) + return -EINVAL; + + if (tg->qos_level == qos_level) + goto done; + + if (tg->qos_level == -1 && qos_level == 0) + return -EINVAL; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level)); + rcu_read_unlock(); +done: + return 0; +} + +static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->qos_level; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -11205,6 +11314,13 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, +#endif +#ifdef CONFIG_QOS_SCHED + { + .name = "qos_level", + .read_s64 = cpu_qos_read, + .write_s64 = cpu_qos_write, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec7b3e0a2b20..5a8afed85a10 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -409,6 +409,10 @@ struct task_group {
struct cfs_bandwidth cfs_bandwidth;
+#ifdef CONFIG_QOS_SCHED + long qos_level; +#endif + #ifdef CONFIG_UCLAMP_TASK_GROUP /* The two decimal precision [%] value requested from user-space */ unsigned int uclamp_pct[UCLAMP_CNT];
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
In a co-location scenario, we usually deploy online and offline task groups in the same server.
The online tasks are more important than offline tasks and to avoid offline tasks affects online tasks, we will throttle the offline tasks group when some online task groups running in the same cpu and unthrottle offline tasks when the cpu is about to enter idle state.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- kernel/sched/fair.c | 145 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf9933c0158a..94290eefba36 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -164,6 +164,10 @@ int __weak arch_asym_cpu_priority(int cpu) #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) #endif
+#ifdef CONFIG_QOS_SCHED +static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -8097,6 +8101,124 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ set_last_buddy(se); }
+#ifdef CONFIG_QOS_SCHED +static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *se; + long task_delta, idle_task_delta, dequeue = 1; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + /* freeze hierarchy runnable averages while throttled */ + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); + + task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + break; + + if (dequeue) + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + + if (qcfs_rq->load.weight) + dequeue = 0; + } + + if (!se) + sub_nr_running(rq, task_delta); + + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + + list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); +} + +static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + int enqueue = 1; + long task_delta, idle_task_delta; + + se = cfs_rq->tg->se[cpu_of(rq)]; + + cfs_rq->throttled = 0; + + update_rq_clock(rq); + + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + list_del_init(&cfs_rq->throttled_list); + + /* update hierarchical throttle state */ + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + + if (!cfs_rq->load.weight) + return; + + task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + enqueue = 0; + + cfs_rq = cfs_rq_of(se); + if (enqueue) + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + if (cfs_rq_throttled(cfs_rq)) + break; + } + + assert_list_leaf_cfs_rq(rq); + + if (!se) + add_nr_running(rq, task_delta); + + /* Determine whether we need to wake up potentially idle CPU: */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_curr(rq); +} + +static int unthrottle_qos_cfs_rqs(int cpu) +{ + struct cfs_rq *cfs_rq, *tmp_rq; + int res = 0; + + list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), + throttled_list) { + if (cfs_rq_throttled(cfs_rq)) { + unthrottle_qos_cfs_rq(cfs_rq); + res++; + } + } + + return res; +} + +static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && + !sched_idle_cpu(smp_processor_id()) && + cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { + throttle_qos_cfs_rq(cfs_rq); + return true; + } + + return false; +} +#endif + #ifdef CONFIG_SMP static struct task_struct *pick_task_fair(struct rq *rq) { @@ -8187,6 +8309,16 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
se = pick_next_entity(cfs_rq, curr); cfs_rq = group_cfs_rq(se); +#ifdef CONFIG_QOS_SCHED + if (check_qos_cfs_rq(cfs_rq)) { + cfs_rq = &rq->cfs; + WARN(cfs_rq->nr_running == 0, + "rq->nr_running=%u, cfs_rq->idle_h_nr_running=%u\n", + rq->nr_running, cfs_rq->idle_h_nr_running); + if (unlikely(!cfs_rq->nr_running)) + return NULL; + } +#endif } while (cfs_rq);
p = task_of(se); @@ -8265,6 +8397,12 @@ done: __maybe_unused; if (new_tasks > 0) goto again;
+#ifdef CONFIG_QOS_SCHED + if (unthrottle_qos_cfs_rqs(cpu_of(rq))) { + rq->idle_stamp = 0; + goto again; + } +#endif /* * rq is about to be idle, check if we need to update the * lost_idle_time of clock_pelt @@ -12884,6 +13022,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void) { +#ifdef CONFIG_QOS_SCHED + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); +#endif + #ifdef CONFIG_SMP int i;
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
If online tasks occupy 100% CPU resources, offline tasks can't be scheduled since offline tasks are throttled, as a result, offline task can't timely respond after receiving SIGKILL signal.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- include/linux/cgroup.h | 4 ++++ include/linux/sched.h | 4 ++++ kernel/cgroup/cgroup.c | 22 ++++++++++++++++++++++ kernel/sched/core.c | 32 ++++++++++++++++++++++++++++++++ kernel/signal.c | 3 +++ 5 files changed, 65 insertions(+)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 885f5395fcd0..b400d8d278c0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -857,4 +857,8 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
#endif /* CONFIG_CGROUP_BPF */
+#ifdef CONFIG_QOS_SCHED +void cgroup_move_task_to_root(struct task_struct *tsk); +#endif + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 873a9b43e63e..2b64333aa7af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2466,4 +2466,8 @@ static inline bool dynamic_affinity_enabled(void) return static_branch_unlikely(&__dynamic_affinity_switch); } #endif +#ifdef CONFIG_QOS_SCHED +void sched_move_offline_task(struct task_struct *p); +#endif + #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4d42f0cbc11e..d65411063781 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3013,6 +3013,28 @@ void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked ss->post_attach(); }
+#ifdef CONFIG_QOS_SCHED +void cgroup_move_task_to_root(struct task_struct *tsk) +{ + struct css_set *css; + struct cgroup *cpu_cgrp; + struct cgroup *cpu_root_cgrp; + + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + + spin_lock_irq(&css_set_lock); + css = task_css_set(tsk); + cpu_cgrp = css->subsys[cpu_cgrp_id]->cgroup; + cpu_root_cgrp = &cpu_cgrp->root->cgrp; + spin_unlock_irq(&css_set_lock); + + (void)cgroup_attach_task(cpu_root_cgrp, tsk, false); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); +} +#endif + static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ac7abb6c5c64..2309a14e81de 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10347,6 +10347,38 @@ static void sched_change_qos_group(struct task_struct *tsk, struct task_group *t __setscheduler_prio(tsk, normal_prio(tsk)); } } + +struct offline_args { + struct work_struct work; + struct task_struct *p; +}; + +static void sched_move_work(struct work_struct *work) +{ + struct sched_param param = { .sched_priority = 0 }; + struct offline_args *args = container_of(work, struct offline_args, work); + + cgroup_move_task_to_root(args->p); + sched_setscheduler(args->p, SCHED_NORMAL, ¶m); + put_task_struct(args->p); + kfree(args); +} + +void sched_move_offline_task(struct task_struct *p) +{ + struct offline_args *args; + + if (unlikely(task_group(p)->qos_level != -1)) + return; + + args = kmalloc(sizeof(struct offline_args), GFP_ATOMIC); + if (args) { + get_task_struct(p); + args->p = p; + INIT_WORK(&args->work, sched_move_work); + queue_work(system_highpri_wq, &args->work); + } +} #endif
static inline void alloc_uclamp_sched_group(struct task_group *tg, diff --git a/kernel/signal.c b/kernel/signal.c index 2547fa73bde5..e50383449808 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1054,6 +1054,9 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) signal->group_stop_count = 0; t = p; do { +#ifdef CONFIG_QOS_SCHED + sched_move_offline_task(t); +#endif task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1);
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
In cpu hotplug case, when a cpu go to offline, we should unthrottle cfs_rq which be throttled on this cpu, so they will be migrated to online cpu.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- kernel/sched/fair.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94290eefba36..eff81e480604 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -166,6 +166,7 @@ int __weak arch_asym_cpu_priority(int cpu)
#ifdef CONFIG_QOS_SCHED static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +static int unthrottle_qos_cfs_rqs(int cpu); #endif
#ifdef CONFIG_CFS_BANDWIDTH @@ -6141,6 +6142,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); + +#ifdef CONFIG_QOS_SCHED + unthrottle_qos_cfs_rqs(cpu_of(rq)); +#endif }
#else /* CONFIG_CFS_BANDWIDTH */
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
When freeing a taskgroup, we will free cfs rqs of the group, even if cfs rqs have been throttled, otherwise it will cause a UAF Bug. Therefore before freeing a taskgroup, we should unthrottle all cfs rqs belonging to the taskgroup.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- kernel/sched/fair.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eff81e480604..fee251393313 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8222,6 +8222,17 @@ static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
return false; } + +static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (cfs_rq->tg->qos_level == -1 && cfs_rq_throttled(cfs_rq)) + unthrottle_qos_cfs_rq(cfs_rq); + rq_unlock_irqrestore(rq, &rf); +} #endif
#ifdef CONFIG_SMP @@ -12653,6 +12664,10 @@ void free_fair_sched_group(struct task_group *tg) int i;
for_each_possible_cpu(i) { +#ifdef CONFIG_QOS_SCHED + if (tg->cfs_rq) + unthrottle_qos_sched_group(tg->cfs_rq[i]); +#endif if (tg->cfs_rq) kfree(tg->cfs_rq[i]); if (tg->se)
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2309a14e81de..b82bddac1352 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7687,8 +7687,8 @@ static int __sched_setscheduler(struct task_struct *p, * isolation will be invalid, so return -EINVAL in this case. */ if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) { - task_rq_unlock(rq, p, &rf); - return -EINVAL; + retval = -EINVAL; + goto unlock; } #endif
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
When online tasks occupy cpu long time, offline task will not get cpu to run, the priority inversion issue may be triggered in this case. If the above case occurs, we will unthrottle offline tasks and let its get a chance to run. When online tasks occupy cpu over 5s(defaule value), we will unthrottle offline tasks and enter a msleep loop before exit to usermode util the cpu goto idle.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- include/linux/sched.h | 7 +++ include/linux/sched/sysctl.h | 5 +++ kernel/entry/common.c | 7 ++- kernel/sched/core.c | 3 ++ kernel/sched/fair.c | 84 ++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 4 ++ kernel/sysctl.c | 24 +++++++++++ 7 files changed, 130 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b64333aa7af..e92c57841135 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2468,6 +2468,13 @@ static inline bool dynamic_affinity_enabled(void) #endif #ifdef CONFIG_QOS_SCHED void sched_move_offline_task(struct task_struct *p); +void sched_qos_offline_wait(void); +int sched_qos_cpu_overload(void); +#else +static inline int sched_qos_cpu_overload(void) +{ + return 0; +} #endif
#endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index ede157a678f8..28d9be8e4614 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -32,4 +32,9 @@ extern int sysctl_numa_balancing_mode; #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY extern int sysctl_sched_util_low_pct; #endif +#ifdef CONFIG_QOS_SCHED +extern unsigned int sysctl_overload_detect_period; +extern unsigned int sysctl_offline_wait_interval; +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index be61332c66b5..e3df7fdfd901 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -170,6 +170,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(regs);
+#ifdef CONFIG_QOS_SCHED + sched_qos_offline_wait(); +#endif + /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work);
@@ -200,7 +204,8 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) tick_nohz_user_enter_prepare();
ti_work = read_thread_flags(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) || + sched_qos_cpu_overload())) ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b82bddac1352..f72cd213a784 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9981,6 +9981,9 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ +#ifdef CONFIG_QOS_SCHED + init_qos_hrtimer(i); +#endif init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fee251393313..1fe68608877a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -56,6 +56,10 @@ #include "stats.h" #include "autogroup.h"
+#ifdef CONFIG_QOS_SCHED +#include <linux/delay.h> +#endif + /* * Targeted preemption latency for CPU-bound tasks: * @@ -166,6 +170,10 @@ int __weak arch_asym_cpu_priority(int cpu)
#ifdef CONFIG_QOS_SCHED static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer); +static DEFINE_PER_CPU(int, qos_cpu_overload); +unsigned int sysctl_overload_detect_period = 5000; /* in ms */ +unsigned int sysctl_offline_wait_interval = 100; /* in ms */ static int unthrottle_qos_cfs_rqs(int cpu); #endif
@@ -8107,6 +8115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ }
#ifdef CONFIG_QOS_SCHED +static void start_qos_hrtimer(int cpu); static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -8140,6 +8149,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) if (!se) sub_nr_running(rq, task_delta);
+ if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) + start_qos_hrtimer(cpu_of(rq)); + cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq);
@@ -8195,7 +8207,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); }
-static int unthrottle_qos_cfs_rqs(int cpu) +static int __unthrottle_qos_cfs_rqs(int cpu) { struct cfs_rq *cfs_rq, *tmp_rq; int res = 0; @@ -8211,11 +8223,25 @@ static int unthrottle_qos_cfs_rqs(int cpu) return res; }
+static int unthrottle_qos_cfs_rqs(int cpu) +{ + int res; + + res = __unthrottle_qos_cfs_rqs(cpu); + if (res) + hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu))); + + return res; +} + static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) { + if (unlikely(__this_cpu_read(qos_cpu_overload))) + return false; + if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && - !sched_idle_cpu(smp_processor_id()) && - cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { + !sched_idle_cpu(smp_processor_id()) && + cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { throttle_qos_cfs_rq(cfs_rq); return true; } @@ -8233,6 +8259,56 @@ static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq) unthrottle_qos_cfs_rq(cfs_rq); rq_unlock_irqrestore(rq, &rf); } + +void sched_qos_offline_wait(void) +{ + long qos_level; + + while (unlikely(this_cpu_read(qos_cpu_overload))) { + rcu_read_lock(); + qos_level = task_group(current)->qos_level; + rcu_read_unlock(); + if (qos_level != -1 || signal_pending(current)) + break; + msleep_interruptible(sysctl_offline_wait_interval); + } +} + +int sched_qos_cpu_overload(void) +{ + return __this_cpu_read(qos_cpu_overload); +} + +static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer) +{ + struct rq_flags rf; + struct rq *rq = this_rq(); + + rq_lock_irqsave(rq, &rf); + if (__unthrottle_qos_cfs_rqs(smp_processor_id())) + __this_cpu_write(qos_cpu_overload, 1); + rq_unlock_irqrestore(rq, &rf); + + return HRTIMER_NORESTART; +} + +static void start_qos_hrtimer(int cpu) +{ + ktime_t time; + struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu)); + + time = ktime_add_ms(hrtimer->base->get_time(), (u64)sysctl_overload_detect_period); + hrtimer_set_expires(hrtimer, time); + hrtimer_start_expires(hrtimer, HRTIMER_MODE_ABS_PINNED); +} + +void init_qos_hrtimer(int cpu) +{ + struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu)); + + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer->function = qos_overload_timer_handler; +} #endif
#ifdef CONFIG_SMP @@ -8418,6 +8494,8 @@ done: __maybe_unused; rq->idle_stamp = 0; goto again; } + + __this_cpu_write(qos_cpu_overload, 0); #endif /* * rq is about to be idle, check if we need to update the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5a8afed85a10..feafe8661075 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1401,6 +1401,10 @@ do { \ flags = _raw_spin_rq_lock_irqsave(rq); \ } while (0)
+#ifdef CONFIG_QOS_SCHED +void init_qos_hrtimer(int cpu); +#endif + #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index acc20b417dc8..e9af234bf882 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,6 +96,10 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); static const int six_hundred_forty_kb = 640 * 1024; #endif
+#ifdef CONFIG_QOS_SCHED +static int one_thousand = 1000; +static int hundred_thousand = 100000; +#endif
static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2045,6 +2049,26 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#ifdef CONFIG_QOS_SCHED + { + .procname = "qos_overload_detect_period_ms", + .data = &sysctl_overload_detect_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_HUNDRED, + .extra2 = &hundred_thousand, + }, + { + .procname = "qos_offline_wait_interval_ms", + .data = &sysctl_offline_wait_interval, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_HUNDRED, + .extra2 = &one_thousand, + }, +#endif { .procname = "max_rcu_stall_to_panic", .data = &sysctl_max_rcu_stall_to_panic,
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index f1d585964710..e4b25c1f0a82 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -176,6 +176,7 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y # CONFIG_CGROUP_MISC is not set +CONFIG_QOS_SCHED=y # CONFIG_CGROUP_DEBUG is not set CONFIG_SOCK_CGROUP_DATA=y CONFIG_NAMESPACES=y
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
cfs_bandwidth_usage_inc() need hold jump_label_mutex and might sleep, so we can not call it in atomic context. Fix this by moving cfs_bandwidth_usage_{inc,dec}() out of rcu read critical section.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- kernel/sched/core.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f72cd213a784..08ce8aada0b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11242,13 +11242,10 @@ static int tg_change_scheduler(struct task_group *tg, void *data) struct cgroup_subsys_state *css = &tg->css;
tg->qos_level = qos_level; - if (qos_level == -1) { + if (qos_level == -1) policy = SCHED_IDLE; - cfs_bandwidth_usage_inc(); - } else { + else policy = SCHED_NORMAL; - cfs_bandwidth_usage_dec(); - }
param.sched_priority = 0; css_task_iter_start(css, 0, &it); @@ -11276,6 +11273,13 @@ static int cpu_qos_write(struct cgroup_subsys_state *css, if (tg->qos_level == -1 && qos_level == 0) return -EINVAL;
+ cpus_read_lock(); + if (qos_level == -1) + cfs_bandwidth_usage_inc(); + else + cfs_bandwidth_usage_dec(); + cpus_read_unlock(); + rcu_read_lock(); walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level)); rcu_read_unlock();
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
-----------------------------------------------------------------
when unthrottle a cfs_rq at distribute_cfs_runtime(), another cpu may re-throttle this cfs_rq at qos_throttle_cfs_rq() before access the cfs_rq->throttle_list.next, but meanwhile, qos throttle will attach the cfs_rq throttle_list node to percpu qos_throttled_cfs_rq, it will change cfs_rq->throttle_list.next and cause panic or hardlockup at distribute_cfs_runtime().
Fix it by adding a qos_throttle_list node in struct cfs_rq, and qos throttle disuse the cfs_rq->throttle_list.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- kernel/sched/fair.c | 10 +++++++--- kernel/sched/sched.h | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1fe68608877a..cf75caae5911 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6045,6 +6045,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); #endif +#ifdef CONFIG_QOS_SCHED + INIT_LIST_HEAD(&cfs_rq->qos_throttled_list); +#endif }
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -8155,7 +8158,8 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq);
- list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); + list_add(&cfs_rq->qos_throttled_list, + &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); }
static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) @@ -8173,7 +8177,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq);
cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; - list_del_init(&cfs_rq->throttled_list); + list_del_init(&cfs_rq->qos_throttled_list);
/* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -8213,7 +8217,7 @@ static int __unthrottle_qos_cfs_rqs(int cpu) int res = 0;
list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), - throttled_list) { + qos_throttled_list) { if (cfs_rq_throttled(cfs_rq)) { unthrottle_qos_cfs_rq(cfs_rq); res++; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index feafe8661075..7393c1a62513 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -654,6 +654,10 @@ struct cfs_rq { #endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ + +#if defined(CONFIG_QOS_SCHED) + struct list_head qos_throttled_list; +#endif };
static inline int rt_bandwidth_enabled(void)
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
Before, when detect the cpu is overloaded, we throttle offline tasks at exit_to_user_mode_loop() before returning to user mode. Some architects(e.g.,arm64) do not support QOS scheduler because a task do not via exit_to_user_mode_loop() return to userspace at these platforms. In order to slove this problem and support qos scheduler on all architectures, if we require throttling offline tasks, we set flag TIF_NOTIFY_RESUME to an offline task when it is picked and throttle it at tracehook_notify_resume().
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- include/linux/resume_user_mode.h | 5 +++++ kernel/entry/common.c | 7 +------ kernel/sched/fair.c | 33 ++++++++++++++++++++++++++++++-- 3 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index 285189454449..a868cb5f4303 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,6 +59,11 @@ static inline void resume_user_mode_work(struct pt_regs *regs) blkcg_maybe_throttle_current();
rseq_handle_notify_resume(NULL, regs); + +#ifdef CONFIG_QOS_SCHED + sched_qos_offline_wait(); +#endif + }
#endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e3df7fdfd901..be61332c66b5 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -170,10 +170,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(regs);
-#ifdef CONFIG_QOS_SCHED - sched_qos_offline_wait(); -#endif - /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work);
@@ -204,8 +200,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) tick_nohz_user_enter_prepare();
ti_work = read_thread_flags(); - if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) || - sched_qos_cpu_overload())) + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf75caae5911..ecb9dab2d1ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -58,6 +58,7 @@
#ifdef CONFIG_QOS_SCHED #include <linux/delay.h> +#include <linux/resume_user_mode.h> #endif
/* @@ -8118,6 +8119,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ }
#ifdef CONFIG_QOS_SCHED + +static inline bool is_offline_task(struct task_struct *p) +{ + return task_group(p)->qos_level == QOS_LEVEL_OFFLINE; +} + static void start_qos_hrtimer(int cpu); static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { @@ -8272,9 +8279,10 @@ void sched_qos_offline_wait(void) rcu_read_lock(); qos_level = task_group(current)->qos_level; rcu_read_unlock(); - if (qos_level != -1 || signal_pending(current)) + if (qos_level != -1 || fatal_signal_pending(current)) break; - msleep_interruptible(sysctl_offline_wait_interval); + + schedule_timeout_killable(msecs_to_jiffies(sysctl_offline_wait_interval)); } }
@@ -8313,6 +8321,23 @@ void init_qos_hrtimer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); hrtimer->function = qos_overload_timer_handler; } + +/* + * To avoid Priority inversion issues, when this cpu is qos_cpu_overload, + * we should schedule offline tasks to run so that they can leave kernel + * critical sections, and throttle them before returning to user mode. + */ +static void qos_schedule_throttle(struct task_struct *p) +{ + if (unlikely(current->flags & PF_KTHREAD)) + return; + + if (unlikely(this_cpu_read(qos_cpu_overload))) { + if (is_offline_task(p)) + set_notify_resume(p); + } +} + #endif
#ifdef CONFIG_SMP @@ -8474,6 +8499,10 @@ done: __maybe_unused;
update_misfit_status(p, rq);
+#ifdef CONFIG_QOS_SCHED + qos_schedule_throttle(p); +#endif + return p;
idle:
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,转换为PR失败! 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/N... 失败原因:补丁集缺失封面信息 建议解决方法:请提供补丁集并重新发送您的补丁集到邮件列表
FeedBack: The patch(es) which you have sent to kernel@openeuler.org has been converted to PR failed! Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/N... Failed Reason: the cover of the patches is missing Suggest Solution: please checkout and apply the patches' cover and send all again
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,转换为PR失败! 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/N... 失败原因:补丁集缺失封面信息 建议解决方法:请提供补丁集并重新发送您的补丁集到邮件列表
FeedBack: The patch(es) which you have sent to kernel@openeuler.org has been converted to PR failed! Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/N... Failed Reason: the cover of the patches is missing Suggest Solution: please checkout and apply the patches' cover and send all again
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
1. Qos throttle reuse tg_{throttle,unthrottle}_{up,down} that can write some cfs-bandwidth fields, it may cause some unknown data error. So add qos_tg_{throttle,unthrottle}_{up,down} for qos throttle.
2. walk_tg_tree_from() caller must hold rcu_lock, currently there is none, so add it now.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- kernel/sched/fair.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ecb9dab2d1ed..bf5aa616ee0d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6132,6 +6132,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
+#ifdef CONFIG_QOS_SCHED + unthrottle_qos_cfs_rqs(cpu_of(rq)); +#endif + rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; @@ -6154,10 +6158,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); - -#ifdef CONFIG_QOS_SCHED - unthrottle_qos_cfs_rqs(cpu_of(rq)); -#endif }
#else /* CONFIG_CFS_BANDWIDTH */ @@ -8126,6 +8126,27 @@ static inline bool is_offline_task(struct task_struct *p) }
static void start_qos_hrtimer(int cpu); + +static int qos_tg_unthrottle_up(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count--; + + return 0; +} + +static int qos_tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count++; + + return 0; +} + static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -8136,7 +8157,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
/* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, qos_tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock();
task_delta = cfs_rq->h_nr_running; @@ -8163,7 +8184,6 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) start_qos_hrtimer(cpu_of(rq));
cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq);
list_add(&cfs_rq->qos_throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); @@ -8172,7 +8192,6 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; int enqueue = 1; long task_delta, idle_task_delta; @@ -8181,13 +8200,12 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
- update_rq_clock(rq); - - cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; list_del_init(&cfs_rq->qos_throttled_list);
/* update hierarchical throttle state */ - walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_nop, qos_tg_unthrottle_up, (void *)rq); + rcu_read_unlock();
if (!cfs_rq->load.weight) return;
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
------------[ cut here]------------ rq->clock_update_flags < RQCF_ACT_SKIP WARNING: CPU: 5 PID: 3312 at kernel/sched/sched.h:1223 update_curr+0x1e5/0x210 CPU: 5 PID: 3312 Comm: a.out Tainted: G S5.10.0.zq+ #1 Hardware name: Huawei RH2288H V3/BC11HGSA0, BIOS 3.35 10/20/2016 RIP: 0010:update_curr+0x1e5/0x210 enqueue_entity+0x378/0xd00 unthrottle_qos_cfs_rq+0x1bc/0x2a0 __unthrottle_qos_cfs_rqs+0x87/0xa0 qos_overload_timer_handler+0x35/0x60 __run_hrtimer+0x5e/0x190 __hrtimer_run_queues+0x81/0xe0 hrtimer_interrupt+0x110/0x2c0 __sysvec_apic_timer_interrupt+0x5f/0xd0 sysvec_apic_timer_interrupt+0x31/0x80 asm_sysvec_apic_timer_interrupt+0x12/0x20
After the last rq_pin_lock(), there is no rq clock update before calling enqueue_entity() at unthrottle_qos_cfs_rq();
This patch fixes it by updating rq clock before calling enqueue_entity().
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf5aa616ee0d..2add3d9c19b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8200,6 +8200,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
+ update_rq_clock(rq); list_del_init(&cfs_rq->qos_throttled_list);
/* update hierarchical throttle state */
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
-------------------------------
When a cfs_rq throttled by qos, mark cfs_rq->throttled as 1, and cfs bw will unthrottled this cfs_rq by mistake, it cause a list_del_valid warning. So add macro QOS_THROTTLED(=2), when a cfs_rq is throttled by qos, we mark the cfs_rq->throttled as QOS_THROTTLED, will check the value of cfs_rq->throttled before unthrottle a cfs_rq.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- kernel/sched/fair.c | 139 ++++++++++++++++++++++++++++---------------- 1 file changed, 90 insertions(+), 49 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2add3d9c19b7..bc58182b201f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -170,6 +170,13 @@ int __weak arch_asym_cpu_priority(int cpu) #endif
#ifdef CONFIG_QOS_SCHED + +/* + * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled + * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1). + */ +#define QOS_THROTTLED 2 + static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer); static DEFINE_PER_CPU(int, qos_cpu_overload); @@ -5507,6 +5514,14 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq)];
+#ifdef CONFIG_QOS_SCHED + /* + * if this cfs_rq throttled by qos, not need unthrottle it. + */ + if (cfs_rq->throttled == QOS_THROTTLED) + return; +#endif + cfs_rq->throttled = 0;
update_rq_clock(rq); @@ -8122,42 +8137,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
static inline bool is_offline_task(struct task_struct *p) { - return task_group(p)->qos_level == QOS_LEVEL_OFFLINE; + return task_group(p)->qos_level == -1; }
static void start_qos_hrtimer(int cpu);
-static int qos_tg_unthrottle_up(struct task_group *tg, void *data) -{ - struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - - cfs_rq->throttle_count--; - - return 0; -} - -static int qos_tg_throttle_down(struct task_group *tg, void *data) -{ - struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - - cfs_rq->throttle_count++; - - return 0; -} - static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; - long task_delta, idle_task_delta, dequeue = 1; + long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
/* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, qos_tg_throttle_down, tg_nop, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock();
task_delta = cfs_rq->h_nr_running; @@ -8166,24 +8161,44 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) - break; + goto done; + + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
- if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta;
- if (qcfs_rq->load.weight) - dequeue = 0; + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; + } }
- if (!se) - sub_nr_running(rq, task_delta); + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + } + + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta);
+done: if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) start_qos_hrtimer(cpu_of(rq));
- cfs_rq->throttled = 1; + cfs_rq->throttled = QOS_THROTTLED;
list_add(&cfs_rq->qos_throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); @@ -8193,11 +8208,13 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; - int enqueue = 1; long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
+ if (cfs_rq->throttled != QOS_THROTTLED) + return; + cfs_rq->throttled = 0;
update_rq_clock(rq); @@ -8205,32 +8222,58 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_nop, qos_tg_unthrottle_up, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); rcu_read_unlock();
- if (!cfs_rq->load.weight) - return; + if (!cfs_rq->load.weight) { + if (!cfs_rq->on_list) + return; + /* + * Nothing to run but something to decay (on_list)? + * Complete the branch. + */ + for_each_sched_entity(se) { + if (list_add_leaf_cfs_rq(cfs_rq_of(se))) + break; + } + goto unthrottle_throttle; + }
task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) - enqueue = 0; + break;
cfs_rq = cfs_rq_of(se); - if (enqueue) - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta;
if (cfs_rq_throttled(cfs_rq)) - break; + goto unthrottle_throttle; }
- assert_list_leaf_cfs_rq(rq); + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se);
- if (!se) - add_nr_running(rq, task_delta); + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + } + + add_nr_running(rq, task_delta); + +unthrottle_throttle: + + assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -13172,13 +13215,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void) { -#ifdef CONFIG_QOS_SCHED - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); -#endif - #ifdef CONFIG_SMP int i;
@@ -13192,6 +13228,11 @@ __init void init_sched_fair_class(void) #endif }
+#ifdef CONFIG_QOS_SCHED + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); +#endif + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA
--------------------------------
Enable CONFIG_QOS_SCHED to support qos scheduler.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com --- arch/x86/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 67ad5cb5ba84..250e686c8fb2 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -185,6 +185,7 @@ CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y +CONFIG_QOS_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y