We introduce the qos smt expeller, which lets online tasks expel offline tasks on the SMT sibling CPUs and exclusively occupy CPU resources. In this way we are able to improve the QoS of online tasks in co-location.
Guan Jing (8):
  sched: Introduce qos smt expeller for co-location
  sched: Implement the function of qos smt expeller
  sched: Add statistics for qos smt expeller
  sched: Add tracepoint for qos smt expeller
  config: Enable CONFIG_QOS_SCHED_SMT_EXPELLER
  sched/fair: Start tracking qos_offline tasks count in cfs_rq
  sched/fair: Introduce QOS_SMT_EXPELL priority reversion mechanism
  sched/fair: Add cmdline nosmtexpell
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 include/linux/sched.h                  |  13 +
 include/trace/events/sched.h           |  55 ++++
 init/Kconfig                           |   9 +
 kernel/sched/debug.c                   |   5 +
 kernel/sched/fair.c                    | 345 +++++++++++++++++++++++--
 kernel/sched/sched.h                   |  27 ++
 8 files changed, 440 insertions(+), 16 deletions(-)
From: Guan Jing <guanjing6@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8O3MY
CVE: NA
--------------------------------
We introduce the qos smt expeller, which lets online tasks expel offline tasks on the SMT sibling CPUs and exclusively occupy CPU resources. In this way we are able to improve the QoS of online tasks in co-location.
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 init/Kconfig | 9 +++++++++
 1 file changed, 9 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig
index 2ee1384c4f81..aef244c7d321 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1002,6 +1002,15 @@ config QOS_SCHED
If in doubt, say N.
+config QOS_SCHED_SMT_EXPELLER
+	bool "Qos smt expeller"
+	depends on SCHED_SMT
+	depends on QOS_SCHED
+	default n
+	help
+	  This feature enable online tasks to expel offline tasks
+	  on the smt sibling cpus, and exclusively occupy CPU resources.
+
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
 	depends on CGROUP_SCHED
From: Guan Jing <guanjing6@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8O3MY
CVE: NA
--------------------------------
We implement the qos smt expeller function through the following two points:
a) when online tasks and offline tasks are running on the same physical cpu, online tasks will send an IPI to expel offline tasks on the SMT sibling CPUs;
b) when online tasks are running, the SMT sibling CPUs will not allow offline tasks to be selected.
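For illustration only, the two rules can be modelled in a small stand-alone C program; this is not the kernel code below, the two-sibling topology and the per-cpu state arrays are simplified stand-ins:

  #include <stdbool.h>
  #include <stdio.h>

  #define NR_CPUS 4

  enum qos_level { QOS_LEVEL_OFFLINE = -1, QOS_LEVEL_ONLINE = 0 };

  /* per-cpu: qos level of the currently running task, and whether only
   * offline (SCHED_IDLE-like) tasks are queued */
  static int  cpu_status[NR_CPUS]          = { QOS_LEVEL_ONLINE, QOS_LEVEL_OFFLINE,
                                               QOS_LEVEL_OFFLINE, QOS_LEVEL_OFFLINE };
  static bool only_offline_queued[NR_CPUS] = { false, true, true, true };

  /* assumed topology: cpu0/cpu1 are SMT siblings, cpu2/cpu3 are SMT siblings */
  static int sibling_of(int cpu) { return cpu ^ 1; }

  /* rule b): do not pick an offline task while the sibling runs an online one */
  static bool must_stay_idle(int cpu)
  {
          return cpu_status[sibling_of(cpu)] == QOS_LEVEL_ONLINE &&
                 only_offline_queued[cpu];
  }

  /* rule a): after picking an online task, kick a sibling still running offline work */
  static void expel_sibling(int cpu)
  {
          int sib = sibling_of(cpu);

          if (cpu_status[sib] != QOS_LEVEL_ONLINE)
                  printf("cpu%d: send resched IPI to sibling cpu%d\n", cpu, sib);
  }

  int main(void)
  {
          expel_sibling(0);       /* an online task was just picked on cpu0 */
          printf("cpu1 must idle instead of running offline work: %d\n",
                 must_stay_idle(1));
          return 0;
  }

In the actual patch the per-cpu state is qos_smt_status, the kick is a plain scheduler IPI, and the receiving side reacts in scheduler_ipi() by setting TIF_NEED_RESCHED.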
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 include/linux/sched.h |   7 ++
 kernel/sched/fair.c   | 189 +++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h  |   5 ++
 3 files changed, 199 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3520e3fbaa91..b84b37b0ddff 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2011,9 +2011,16 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
 	__get_task_comm(buf, sizeof(buf), tsk);				\
 })
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+void qos_smt_check_need_resched(void);
+#endif
+
 #ifdef CONFIG_SMP
 static __always_inline void scheduler_ipi(void)
 {
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_smt_check_need_resched();
+#endif
 	/*
 	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
 	 * TIF_NEED_RESCHED remotely (for the first time) will also send
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0de55884f9da..dc6d284e97cd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -62,6 +62,10 @@
 #include <linux/resume_user_mode.h>
 #endif
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+#include <trace/events/ipi.h>
+#endif
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -147,6 +151,10 @@ static int hundred_thousand = 100000;
 static int unthrottle_qos_cfs_rqs(int cpu);
 #endif
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+static DEFINE_PER_CPU(int, qos_smt_status);
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
@@ -8565,6 +8573,131 @@ static void qos_schedule_throttle(struct task_struct *p)
#endif
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+static bool qos_smt_check_siblings_status(int this_cpu)
+{
+	int cpu;
+
+	if (!sched_smt_active())
+		return false;
+
+	for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
+		if (cpu == this_cpu)
+			continue;
+
+		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE)
+			return true;
+	}
+
+	return false;
+}
+
+static bool qos_smt_expelled(int this_cpu)
+{
+	/*
+	 * The qos_smt_status of siblings cpu is online, and current cpu only has
+	 * offline tasks enqueued, there is not suitable task,
+	 * so pick_next_task_fair return null.
+	 */
+	if (qos_smt_check_siblings_status(this_cpu) && sched_idle_cpu(this_cpu))
+		return true;
+
+	return false;
+}
+
+static bool qos_smt_update_status(struct task_struct *p)
+{
+	int status = QOS_LEVEL_OFFLINE;
+
+	if (p != NULL && task_group(p)->qos_level >= QOS_LEVEL_ONLINE)
+		status = QOS_LEVEL_ONLINE;
+
+	if (__this_cpu_read(qos_smt_status) == status)
+		return false;
+
+	__this_cpu_write(qos_smt_status, status);
+
+	return true;
+}
+
+static void qos_smt_send_ipi(int this_cpu)
+{
+	int cpu;
+	struct rq *rq = NULL;
+
+	if (!sched_smt_active())
+		return;
+
+	for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
+		if (cpu == this_cpu)
+			continue;
+
+		rq = cpu_rq(cpu);
+
+		/*
+		 * There are two cases where current don't need to send scheduler_ipi:
+		 * a) The qos_smt_status of siblings cpu is online;
+		 * b) The cfs.h_nr_running of siblings cpu is 0.
+		 */
+		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE ||
+		    rq->cfs.h_nr_running == 0)
+			continue;
+
+		smp_send_reschedule(cpu);
+	}
+}
+
+static void qos_smt_expel(int this_cpu, struct task_struct *p)
+{
+	if (qos_smt_update_status(p))
+		qos_smt_send_ipi(this_cpu);
+}
+
+static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq)
+{
+	int cpu;
+
+	if (!sched_smt_active())
+		return false;
+
+	for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
+		if (cpu == this_cpu)
+			continue;
+
+		/*
+		 * There are two cases rely on the set need_resched to drive away
+		 * offline task:
+		 * a) The qos_smt_status of siblings cpu is online, the task of current cpu is offline;
+		 * b) The qos_smt_status of siblings cpu is offline, the task of current cpu is idle,
+		 *    and current cpu only has SCHED_IDLE tasks enqueued.
+		 */
+		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE &&
+		    task_group(current)->qos_level < QOS_LEVEL_ONLINE)
+			return true;
+
+		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE &&
+		    rq->curr == rq->idle && sched_idle_cpu(this_cpu))
+			return true;
+	}
+
+	return false;
+}
+
+void qos_smt_check_need_resched(void)
+{
+	struct rq *rq = this_rq();
+	int this_cpu = rq->cpu;
+
+	if (test_tsk_need_resched(current))
+		return;
+
+	if (_qos_smt_check_need_resched(this_cpu, rq)) {
+		set_tsk_need_resched(current);
+		set_preempt_need_resched();
+	}
+}
+#endif
+
 #ifdef CONFIG_SMP
 static struct task_struct *pick_task_fair(struct rq *rq)
 {
@@ -8605,14 +8738,30 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	struct sched_entity *se;
 	struct task_struct *p;
 	int new_tasks;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	int this_cpu = rq->cpu;
+#endif
 again:
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	if (qos_smt_expelled(this_cpu)) {
+		__this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE);
+		return NULL;
+	}
+#endif
+
 	if (!sched_fair_runnable(rq))
 		goto idle;
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	if (!prev || prev->sched_class != &fair_sched_class)
-		goto simple;
+	if (!prev || prev->sched_class != &fair_sched_class) {
+#ifdef CONFIG_QOS_SCHED
+		if (cfs_rq->idle_h_nr_running != 0 && rq->online)
+			goto qos_simple;
+		else
+#endif
+			goto simple;
+	}
 	/*
 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
@@ -8696,6 +8845,34 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	}
 	goto done;
+
+#ifdef CONFIG_QOS_SCHED
+qos_simple:
+	if (prev)
+		put_prev_task(rq, prev);
+
+	do {
+		se = pick_next_entity(cfs_rq, NULL);
+		if (check_qos_cfs_rq(group_cfs_rq(se))) {
+			cfs_rq = &rq->cfs;
+			if (!cfs_rq->nr_running)
+				goto idle;
+			continue;
+		}
+
+		cfs_rq = group_cfs_rq(se);
+	} while (cfs_rq);
+
+	p = task_of(se);
+
+	while (se) {
+		set_next_entity(cfs_rq_of(se), se);
+		se = parent_entity(se);
+	}
+
+	goto done;
+#endif
+
 simple:
 #endif
 	if (prev)
@@ -8729,6 +8906,10 @@ done: __maybe_unused;
 	qos_schedule_throttle(p);
 #endif
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_smt_expel(this_cpu, p);
+#endif
+
 	return p;
 idle:
@@ -8762,6 +8943,10 @@ done: __maybe_unused;
 	 */
 	update_idle_rq_clock_pelt(rq);
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_smt_expel(this_cpu, NULL);
+#endif
+
 	return NULL;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3de84e95baf1..4cf2f39e143a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1412,6 +1412,11 @@ do {						\
 } while (0)
 #ifdef CONFIG_QOS_SCHED
+enum task_qos_level {
+	QOS_LEVEL_OFFLINE = -1,
+	QOS_LEVEL_ONLINE = 0,
+	QOS_LEVEL_MAX
+};
 void init_qos_hrtimer(int cpu);
 #endif
From: Guan Jing <guanjing6@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8O3MY
CVE: NA
--------------------------------
We have added two statistics for the qos smt expeller:
a) nr_qos_smt_send_ipi: the number of IPIs sent by online tasks to expel offline tasks;
b) nr_qos_smt_expelled: the number of times an offline task was not picked.
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 include/linux/sched.h | 6 ++++++
 kernel/sched/debug.c  | 5 +++++
 kernel/sched/fair.c   | 2 ++
 3 files changed, 13 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b84b37b0ddff..b24626cc52ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -543,6 +543,12 @@ struct sched_statistics {
 #ifdef CONFIG_SCHED_CORE
 	u64			core_forceidle_sum;
 #endif
+
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	u64			nr_qos_smt_send_ipi;
+	u64			nr_qos_smt_expelled;
+#endif
+
 #endif /* CONFIG_SCHEDSTATS */
 } ____cacheline_aligned;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db6..61b28d61b49e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1040,6 +1040,11 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 		P_SCHEDSTAT(nr_wakeups_passive);
 		P_SCHEDSTAT(nr_wakeups_idle);
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		P_SCHEDSTAT(nr_qos_smt_send_ipi);
+		P_SCHEDSTAT(nr_qos_smt_expelled);
+#endif
+
 		avg_atom = p->se.sum_exec_runtime;
 		if (nr_switches)
 			avg_atom = div64_ul(avg_atom, nr_switches);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dc6d284e97cd..98e2a18d6699 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8643,6 +8643,7 @@ static void qos_smt_send_ipi(int this_cpu)
 		    rq->cfs.h_nr_running == 0)
 			continue;
+		schedstat_inc(current->stats.nr_qos_smt_send_ipi);
 		smp_send_reschedule(cpu);
 	}
 }
@@ -8746,6 +8747,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	if (qos_smt_expelled(this_cpu)) {
 		__this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE);
+		schedstat_inc(rq->curr->stats.nr_qos_smt_expelled);
 		return NULL;
 	}
 #endif
From: Guan Jing <guanjing6@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8O3MY
CVE: NA
--------------------------------
There are two cases where we add a tracepoint:
a) while an online task of the sibling cpu is running, the offline task running on the local cpu will be set TIF_NEED_RESCHED;
b) while an online task of the sibling cpu is running, the next picked offline task of the local cpu will be expelled.
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 include/trace/events/sched.h | 55 ++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |  9 ++++--
 2 files changed, 62 insertions(+), 2 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..4bafb70dfafc 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -268,6 +268,61 @@ TRACE_EVENT(sched_switch,
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+/*
+ * Tracepoint for a offline task being resched:
+ */
+TRACE_EVENT(sched_qos_smt_expel,
+
+	TP_PROTO(struct task_struct *sibling_p, int qos_smt_status),
+
+	TP_ARGS(sibling_p, qos_smt_status),
+
+	TP_STRUCT__entry(
+		__array(	char,	sibling_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	sibling_pid			)
+		__field(	int,	sibling_qos_status		)
+		__field(	int,	sibling_cpu			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->sibling_comm, sibling_p->comm, TASK_COMM_LEN);
+		__entry->sibling_pid		= sibling_p->pid;
+		__entry->sibling_qos_status	= qos_smt_status;
+		__entry->sibling_cpu		= task_cpu(sibling_p);
+	),
+
+	TP_printk("sibling_comm=%s sibling_pid=%d sibling_qos_status=%d sibling_cpu=%d",
+		__entry->sibling_comm, __entry->sibling_pid, __entry->sibling_qos_status,
+		__entry->sibling_cpu)
+);
+
+/*
+ * Tracepoint for a offline task being expelled:
+ */
+TRACE_EVENT(sched_qos_smt_expelled,
+
+	TP_PROTO(struct task_struct *p, int qos_smt_status),
+
+	TP_ARGS(p, qos_smt_status),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	qos_status		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->qos_status	= qos_smt_status;
+	),
+
+	TP_printk("comm=%s pid=%d qos_status=%d",
+		__entry->comm, __entry->pid, __entry->qos_status)
+);
+#endif
+
 /*
  * Tracepoint for a task being migrated:
  */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 98e2a18d6699..c413b3515a56 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8673,12 +8673,16 @@ static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq)
 		 * and current cpu only has SCHED_IDLE tasks enqueued.
 		 */
 		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE &&
-		    task_group(current)->qos_level < QOS_LEVEL_ONLINE)
+		    task_group(current)->qos_level < QOS_LEVEL_ONLINE) {
+			trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu));
 			return true;
+		}
 		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE &&
-		    rq->curr == rq->idle && sched_idle_cpu(this_cpu))
+		    rq->curr == rq->idle && sched_idle_cpu(this_cpu)) {
+			trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu));
 			return true;
+		}
 	}
 	return false;
@@ -8748,6 +8752,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	if (qos_smt_expelled(this_cpu)) {
 		__this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE);
 		schedstat_inc(rq->curr->stats.nr_qos_smt_expelled);
+		trace_sched_qos_smt_expelled(rq->curr, per_cpu(qos_smt_status, this_cpu));
 		return NULL;
 	}
 #endif
From: Guan Jing <guanjing6@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8O3MY
CVE: NA
--------------------------------
Enable CONFIG_QOS_SCHED_SMT_EXPELLER.
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig | 1 +
 arch/x86/configs/openeuler_defconfig   | 1 +
 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 33ba39711884..d92d95a1b59b 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -164,6 +164,7 @@ CONFIG_CGROUP_WRITEBACK=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_QOS_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_QOS_SCHED_SMT_EXPELLER=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_SCHED_MM_CID=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 44040b835333..7bfce92a96ac 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -186,6 +186,7 @@ CONFIG_CGROUP_WRITEBACK=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_QOS_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_QOS_SCHED_SMT_EXPELLER=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_SCHED_MM_CID=y
From: Guan Jing <guanjing6@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8O3MY
-------------------------------
Track how many tasks with the qos offline policy are present in each cfs_rq. This will be used by later commits.
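For illustration only, a stand-alone model of the check this count enables (the real helper added below is qos_sched_idle_cpu(); the struct and field names here are simplified stand-ins):

  #include <stdbool.h>

  struct rq_model {
          unsigned int nr_running;              /* all runnable tasks on this cpu */
          unsigned int qos_idle_h_nr_running;   /* runnable tasks whose group has
                                                   qos_level -1 and SCHED_IDLE policy */
  };

  /* true when the cpu has work, but every runnable task is a qos-offline
   * SCHED_IDLE task, i.e. everything on it is a candidate for expelling */
  static bool only_qos_offline_running(const struct rq_model *rq)
  {
          return rq->nr_running &&
                 rq->nr_running == rq->qos_idle_h_nr_running;
  }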
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 kernel/sched/fair.c  | 82 +++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h | 22 ++++++++++++
 2 files changed, 99 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c413b3515a56..92678934aa70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5703,6 +5703,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, idle_task_delta, dequeue = 1;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	long qos_idle_delta;
+#endif
 	raw_spin_lock(&cfs_b->lock);
 	/* This will start the period timer if necessary */
@@ -5734,6 +5737,10 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
+#endif
+
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 		/* throttled entity or throttle-on-deactivate */
@@ -5747,6 +5754,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
+#endif
 		if (qcfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
@@ -5769,6 +5779,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
+#endif
 	}
 	/* At this point se is NULL and we are at root level*/
@@ -5792,6 +5805,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	long qos_idle_delta;
+#endif
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -5834,6 +5850,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
+#endif
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -5846,6 +5865,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running += task_delta;
 		qcfs_rq->idle_h_nr_running += idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running += qos_idle_delta;
+#endif
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(qcfs_rq))
@@ -5863,6 +5885,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running += task_delta;
 		qcfs_rq->idle_h_nr_running += idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running += qos_idle_delta;
+#endif
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(qcfs_rq))
@@ -6682,6 +6707,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 	int idle_h_nr_running = task_has_idle_policy(p);
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	int qos_idle_h_nr_running = task_has_qos_idle_policy(p);
+#endif
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 	/*
@@ -6708,6 +6736,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running;
+#endif
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
@@ -6728,7 +6759,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
-
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running;
+#endif
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
@@ -6776,6 +6809,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_entity *se = &p->se;
 	int task_sleep = flags & DEQUEUE_SLEEP;
 	int idle_h_nr_running = task_has_idle_policy(p);
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	int qos_idle_h_nr_running = task_has_qos_idle_policy(p);
+#endif
 	bool was_sched_idle = sched_idle_rq(rq);
 	util_est_dequeue(&rq->cfs, p);
@@ -6786,6 +6822,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_running--;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running;
+#endif
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
@@ -6818,7 +6857,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_running--;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
-
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running;
+#endif
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
@@ -8314,7 +8355,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	struct rq *rq = rq_of(cfs_rq);
 	struct sched_entity *se;
 	long task_delta, idle_task_delta;
-
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	long qos_idle_delta;
+#endif
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 	/* freeze hierarchy runnable averages while throttled */
@@ -8324,6 +8367,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
+#endif
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 		/* throttled entity or throttle-on-deactivate */
@@ -8334,6 +8380,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
+#endif
 		if (qcfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
@@ -8356,6 +8405,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
+#endif
 	}
 	/* At this point se is NULL and we are at root level*/
@@ -8376,6 +8428,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	struct rq *rq = rq_of(cfs_rq);
 	struct sched_entity *se;
 	long task_delta, idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	long qos_idle_delta;
+#endif
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -8408,6 +8463,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
+#endif
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
@@ -8417,6 +8475,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		cfs_rq->h_nr_running += task_delta;
 		cfs_rq->idle_h_nr_running += idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
+#endif
 		if (cfs_rq_throttled(cfs_rq))
 			goto unthrottle_throttle;
@@ -8430,6 +8491,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		cfs_rq->h_nr_running += task_delta;
 		cfs_rq->idle_h_nr_running += idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
+#endif
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
@@ -8592,6 +8656,14 @@ static bool qos_smt_check_siblings_status(int this_cpu)
 	return false;
 }
+static bool qos_sched_idle_cpu(int this_cpu)
+{
+	struct rq *rq = cpu_rq(this_cpu);
+
+	return unlikely(rq->nr_running == rq->cfs.qos_idle_h_nr_running &&
+			rq->nr_running);
+}
+
 static bool qos_smt_expelled(int this_cpu)
 {
 	/*
@@ -8599,7 +8671,7 @@ static bool qos_smt_expelled(int this_cpu)
 	 * offline tasks enqueued, there is not suitable task,
 	 * so pick_next_task_fair return null.
 	 */
-	if (qos_smt_check_siblings_status(this_cpu) && sched_idle_cpu(this_cpu))
+	if (qos_smt_check_siblings_status(this_cpu) && qos_sched_idle_cpu(this_cpu))
 		return true;
 	return false;
@@ -8679,7 +8751,7 @@ static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq)
 		}
 		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE &&
-		    rq->curr == rq->idle && sched_idle_cpu(this_cpu)) {
+		    rq->curr == rq->idle && qos_sched_idle_cpu(this_cpu)) {
 			trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu));
 			return true;
 		}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4cf2f39e143a..22d909c8921d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -657,6 +657,14 @@ struct cfs_rq {
 #if defined(CONFIG_QOS_SCHED)
 	struct list_head	qos_throttled_list;
 #endif
+
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	union {
+		unsigned int		qos_idle_h_nr_running; /* qos_level:-1 */
+		unsigned long		qos_idle_h_nr_running_padding;
+	};
+#endif
+
 };
 static inline int rt_bandwidth_enabled(void)
@@ -3286,6 +3294,20 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
 }
 #endif
+#ifdef CONFIG_QOS_SCHED
+static inline int qos_idle_policy(int policy)
+{
+	return policy == QOS_LEVEL_OFFLINE;
+}
+#endif
+
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+static inline int task_has_qos_idle_policy(struct task_struct *p)
+{
+	return qos_idle_policy(task_group(p)->qos_level) && p->policy == SCHED_IDLE;
+}
+#endif
+
 extern void swake_up_all_locked(struct swait_queue_head *q);
 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
From: Guan Jing <guanjing6@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8O3MY
-------------------------------
Here is a typical case in which priority inversion can occasionally be caused by SMT expelling: assume there are two SMT siblings, cA and cB, with online tasks running on cA while offline tasks run on cB. With SMT expelling, the online tasks drive off the offline tasks and occupy all SMT cores exclusively, which in turn starves an offline task that needs to run in order to release resources required by other, higher-priority tasks.
Hence, this patch introduces another mechanism to alleviate this situation. For offline tasks, a metric of the maximum task expelling duration is set up, with a default value of 5 seconds. If such a starved offline task exists, all offline tasks will be allowed to run into one small sleep (msleep) loop in the kernel before they go into user mode; further, if the two SMT siblings (such as cA and cB) are idle or have no online tasks to run, these offline tasks will continue to run in user mode for the next schedule.
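For illustration only, the reversion idea boils down to a time limit on how long offline tasks may stay expelled; the patch below drives this with the qos overload hrtimer and the qos_cpu_overload flag, while the names and constant in this stand-alone model are simplified stand-ins:

  #include <stdbool.h>
  #include <stdint.h>

  #define EXPEL_LIMIT_NS (5ULL * 1000 * 1000 * 1000)   /* default: about 5 seconds */

  struct cpu_model {
          uint64_t expel_start_ns;  /* when offline tasks were first expelled */
          bool     overloaded;      /* set once the limit has been exceeded   */
  };

  /*
   * Called whenever the pick path would normally return no task because only
   * expelled offline work is queued; once the limit is hit we stop expelling,
   * so the starved offline task can run and release its resources.
   */
  static bool allow_offline_to_run(struct cpu_model *cpu, uint64_t now_ns)
  {
          if (!cpu->overloaded && now_ns - cpu->expel_start_ns > EXPEL_LIMIT_NS)
                  cpu->overloaded = true;

          return cpu->overloaded;
  }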
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 92678934aa70..99a0521b6671 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -149,6 +149,7 @@ unsigned int sysctl_offline_wait_interval = 100; /* in ms */
 static int one_thousand = 1000;
 static int hundred_thousand = 100000;
 static int unthrottle_qos_cfs_rqs(int cpu);
+static bool qos_smt_expelled(int this_cpu);
 #endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
@@ -8343,6 +8344,16 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 }
 #ifdef CONFIG_QOS_SCHED
+static inline bool qos_timer_is_activated(int cpu)
+{
+	return hrtimer_active(per_cpu_ptr(&qos_overload_timer, cpu));
+}
+
+static inline void cancel_qos_timer(int cpu)
+{
+	hrtimer_cancel(per_cpu_ptr(&qos_overload_timer, cpu));
+}
+
 static inline bool is_offline_task(struct task_struct *p)
 {
 	return task_group(p)->qos_level == -1;
@@ -8414,7 +8425,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	sub_nr_running(rq, task_delta);
 done:
-	if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq))))
+	if (!qos_timer_is_activated(cpu_of(rq)))
 		start_qos_hrtimer(cpu_of(rq));
 	cfs_rq->throttled = QOS_THROTTLED;
@@ -8505,10 +8516,6 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 unthrottle_throttle:
 	assert_list_leaf_cfs_rq(rq);
-
-	/* Determine whether we need to wake up potentially idle CPU: */
-	if (rq->curr == rq->idle && rq->cfs.nr_running)
-		resched_curr(rq);
 }
 static int __unthrottle_qos_cfs_rqs(int cpu)
@@ -8530,11 +8537,10 @@ static int __unthrottle_qos_cfs_rqs(int cpu)
 static int unthrottle_qos_cfs_rqs(int cpu)
 {
 	int res;
-	res = __unthrottle_qos_cfs_rqs(cpu);
-	if (res)
-		hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu)));
+	if (qos_timer_is_activated(cpu) && !qos_smt_expelled(cpu))
+		cancel_qos_timer(cpu);
 	return res;
 }
@@ -8594,8 +8600,13 @@ static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer)
 	struct rq *rq = this_rq();
 	rq_lock_irqsave(rq, &rf);
-	if (__unthrottle_qos_cfs_rqs(smp_processor_id()))
-		__this_cpu_write(qos_cpu_overload, 1);
+	__unthrottle_qos_cfs_rqs(smp_processor_id());
+	__this_cpu_write(qos_cpu_overload, 1);
+
+	/* Determine whether we need to wake up potentially idle CPU. */
+	if (rq->curr == rq->idle && rq->cfs.nr_running)
+		resched_curr(rq);
+
 	rq_unlock_irqrestore(rq, &rf);
 	return HRTIMER_NORESTART;
@@ -8635,6 +8646,13 @@ static void qos_schedule_throttle(struct task_struct *p)
 	}
 }
+#ifndef CONFIG_QOS_SCHED_SMT_EXPELLER
+static bool qos_smt_expelled(int this_cpu)
+{
+	return false;
+}
+#endif
+
 #endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
@@ -8821,8 +8839,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 again:
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
-	if (qos_smt_expelled(this_cpu)) {
+	if (qos_smt_expelled(this_cpu) && !__this_cpu_read(qos_cpu_overload)) {
 		__this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE);
+
+		if (!qos_timer_is_activated(this_cpu))
+			start_qos_hrtimer(this_cpu);
+
 		schedstat_inc(rq->curr->stats.nr_qos_smt_expelled);
 		trace_sched_qos_smt_expelled(rq->curr, per_cpu(qos_smt_status, this_cpu));
 		return NULL;
@@ -9014,7 +9036,8 @@ done: __maybe_unused;
 		goto again;
 	}
-	__this_cpu_write(qos_cpu_overload, 0);
+	if (!qos_smt_expelled(cpu_of(rq)))
+		__this_cpu_write(qos_cpu_overload, 0);
 #endif
 	/*
 	 * rq is about to be idle, check if we need to update the
From: Guan Jing <guanjing6@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8O3MY
-------------------------------
Add the cmdline option nosmtexpell to disable qos smt expelling when we want to turn it off.
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 kernel/sched/fair.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 99a0521b6671..be12f66d959b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8656,6 +8656,15 @@ static bool qos_smt_expelled(int this_cpu)
 #endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+DEFINE_STATIC_KEY_TRUE(qos_smt_expell_switch);
+
+static int __init qos_sched_smt_noexpell_setup(char *__unused)
+{
+	static_branch_disable(&qos_smt_expell_switch);
+	return 1;
+}
+__setup("nosmtexpell", qos_sched_smt_noexpell_setup);
+
 static bool qos_smt_check_siblings_status(int this_cpu)
 {
 	int cpu;
@@ -8684,6 +8693,9 @@ static bool qos_sched_idle_cpu(int this_cpu)
 static bool qos_smt_expelled(int this_cpu)
 {
+	if (!static_branch_likely(&qos_smt_expell_switch))
+		return false;
+
 	/*
 	 * The qos_smt_status of siblings cpu is online, and current cpu only has
 	 * offline tasks enqueued, there is not suitable task,
@@ -8740,15 +8752,29 @@ static void qos_smt_send_ipi(int this_cpu)
 static void qos_smt_expel(int this_cpu, struct task_struct *p)
 {
+	if (!static_branch_likely(&qos_smt_expell_switch))
+		return;
+
 	if (qos_smt_update_status(p))
 		qos_smt_send_ipi(this_cpu);
 }
+static inline bool qos_smt_enabled(void)
+{
+	if (!static_branch_likely(&qos_smt_expell_switch))
+		return false;
+
+	if (!sched_smt_active())
+		return false;
+
+	return true;
+}
+
 static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq)
 {
 	int cpu;
-	if (!sched_smt_active())
+	if (!qos_smt_enabled())
 		return false;
for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
Feedback: The patch(es) you have sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/3635
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/K...