We introduce the qos smt expeller, which lets online tasks expel offline tasks on the SMT sibling CPUs and exclusively occupy CPU resources. In this way we are able to improve the QoS of online tasks in co-location.
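As a usage reference, here is a minimal sketch of how an offline group is marked so that its tasks become expellable. It assumes the cpu.qos_level cgroup file provided by the existing CONFIG_QOS_SCHED co-location support; the cgroup path and group name are examples only, not part of this series.

/* Hypothetical usage sketch: mark a cpu cgroup offline (qos_level = -1)
 * so its tasks can be expelled by online tasks on the SMT siblings.
 * The cpu.qos_level file is assumed to come from CONFIG_QOS_SCHED;
 * the path below is an example. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/cpu/offline_group/cpu.qos_level", "w");

	if (!f) {
		perror("open cpu.qos_level");
		return 1;
	}
	fputs("-1", f);		/* -1 == QOS_LEVEL_OFFLINE */
	fclose(f);
	return 0;
}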
Xia Fukun (8):
  sched: Introduce qos smt expeller for co-location
  sched: Implement the function of qos smt expeller
  sched: Add statistics for qos smt expeller
  sched: Add tracepoint for qos smt expeller
  config: Enable CONFIG_QOS_SCHED_SMT_EXPELLER
  sched/fair: Start tracking qos_offline tasks count in cfs_rq
  sched/fair: Introduce QOS_SMT_EXPELL priority reversion mechanism
  sched/fair: Add cmdline nosmtexpell
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 include/linux/sched.h                  |  13 +
 include/trace/events/sched.h           |  55 ++++
 init/Kconfig                           |   9 +
 kernel/sched/debug.c                   |   6 +
 kernel/sched/fair.c                    | 345 +++++++++++++++++++++++--
 kernel/sched/sched.h                   |  27 ++
 8 files changed, 441 insertions(+), 16 deletions(-)
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR
CVE: NA
--------------------------------
We introduce the qos smt expeller, which lets online tasks expel offline tasks on the SMT sibling CPUs and exclusively occupy CPU resources. In this way we are able to improve the QoS of online tasks in co-location.
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 init/Kconfig | 9 +++++++++
 1 file changed, 9 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig
index d96c76143610..c189d4d5d91b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -996,6 +996,15 @@ config QOS_SCHED
If in doubt, say N.
+config QOS_SCHED_SMT_EXPELLER
+	bool "Qos smt expeller"
+	depends on SCHED_SMT
+	depends on QOS_SCHED
+	default n
+	help
+	  This feature enables online tasks to expel offline tasks
+	  on the smt sibling cpus, and exclusively occupy CPU resources.
+
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
 	depends on CGROUP_SCHED
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR
CVE: NA
--------------------------------
We implement the function of the qos smt expeller with the following two points:
a) when online tasks and offline tasks are running on the same physical cpu, online tasks send an IPI to expel offline tasks on the SMT sibling cpus;
b) while online tasks are running, the SMT sibling cpus will not allow offline tasks to be selected.
Adapted to openEuler-6.4.
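For orientation, the core of the change is a per-cpu status word refreshed at pick time plus an IPI kick to the SMT siblings whenever that status changes. The snippet below is only a condensed rendering of the fair.c hunks in this patch, not a complete listing:

/* Condensed from the fair.c hunks below, for orientation only. */
static DEFINE_PER_CPU(int, qos_smt_status);

/* Called from pick_next_task_fair() with the task about to run (or NULL). */
static void qos_smt_expel(int this_cpu, struct task_struct *p)
{
	int status = (p && task_group(p)->qos_level >= QOS_LEVEL_ONLINE) ?
		     QOS_LEVEL_ONLINE : QOS_LEVEL_OFFLINE;

	if (__this_cpu_read(qos_smt_status) == status)
		return;

	__this_cpu_write(qos_smt_status, status);
	/* Kick the SMT siblings; scheduler_ipi() then checks whether the
	 * task currently running there is offline and must reschedule. */
	qos_smt_send_ipi(this_cpu);
}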
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 include/linux/sched.h |   7 ++
 kernel/sched/fair.c   | 190 +++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h  |   5 ++
 3 files changed, 200 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6db0879089df..ddf9d10bdc37 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2007,9 +2007,16 @@ extern char *__get_task_comm(struct char *to, size_t len, struct task_struct *tsk);
 	__get_task_comm(buf, sizeof(buf), tsk); \
 })
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+void qos_smt_check_need_resched(void);
+#endif
+
 #ifdef CONFIG_SMP
 static __always_inline void scheduler_ipi(void)
 {
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_smt_check_need_resched();
+#endif
 	/*
 	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
 	 * TIF_NEED_RESCHED remotely (for the first time) will also send
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bc58182b201f..e6dfa99cb864 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -61,6 +61,10 @@
 #include <linux/resume_user_mode.h>
 #endif
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+#include <trace/events/ipi.h>
+#endif
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -185,6 +189,10 @@ unsigned int sysctl_offline_wait_interval = 100;  /* in ms */
 static int unthrottle_qos_cfs_rqs(int cpu);
 #endif
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+static DEFINE_PER_CPU(int, qos_smt_status);
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
@@ -8402,6 +8410,132 @@ static void qos_schedule_throttle(struct task_struct *p)
#endif
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+static bool qos_smt_check_siblings_status(int this_cpu)
+{
+	int cpu;
+
+	if (!sched_smt_active())
+		return false;
+
+	for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
+		if (cpu == this_cpu)
+			continue;
+
+		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE)
+			return true;
+	}
+
+	return false;
+}
+
+static bool qos_smt_expelled(int this_cpu)
+{
+	/*
+	 * The qos_smt_status of siblings cpu is online, and current cpu only has
+	 * offline tasks enqueued, there is not suitable task,
+	 * so pick_next_task_fair return null.
+	 */
+	if (qos_smt_check_siblings_status(this_cpu) && sched_idle_cpu(this_cpu))
+		return true;
+
+	return false;
+}
+
+static bool qos_smt_update_status(struct task_struct *p)
+{
+	int status = QOS_LEVEL_OFFLINE;
+
+	if (p != NULL && task_group(p)->qos_level >= QOS_LEVEL_ONLINE)
+		status = QOS_LEVEL_ONLINE;
+
+	if (__this_cpu_read(qos_smt_status) == status)
+		return false;
+
+	__this_cpu_write(qos_smt_status, status);
+
+	return true;
+}
+
+static void qos_smt_send_ipi(int this_cpu)
+{
+	int cpu;
+	struct rq *rq = NULL;
+
+	if (!sched_smt_active())
+		return;
+
+	for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
+		if (cpu == this_cpu)
+			continue;
+
+		rq = cpu_rq(cpu);
+
+		/*
+		 * There are two cases where current don't need to send scheduler_ipi:
+		 * a) The qos_smt_status of siblings cpu is online;
+		 * b) The cfs.h_nr_running of siblings cpu is 0.
+		 */
+		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE ||
+			rq->cfs.h_nr_running == 0)
+			continue;
+
+		smp_send_reschedule(cpu);
+	}
+}
+
+static void qos_smt_expel(int this_cpu, struct task_struct *p)
+{
+	if (qos_smt_update_status(p))
+		qos_smt_send_ipi(this_cpu);
+}
+
+static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq)
+{
+	int cpu;
+
+	if (!sched_smt_active())
+		return false;
+
+	for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
+		if (cpu == this_cpu)
+			continue;
+
+		/*
+		 * There are two cases rely on the set need_resched to drive away
+		 * offline task:
+		 * a) The qos_smt_status of siblings cpu is online, the task of current
+		 * cpu is offline;
+		 * b) The qos_smt_status of siblings cpu is offline, the task of current
+		 * cpu is idle, and current cpu only has SCHED_IDLE tasks enqueued.
+		 */
+		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE &&
+			task_group(current)->qos_level < QOS_LEVEL_ONLINE)
+			return true;
+
+		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE &&
+			rq->curr == rq->idle && sched_idle_cpu(this_cpu))
+			return true;
+	}
+
+	return false;
+}
+
+void qos_smt_check_need_resched(void)
+{
+	struct rq *rq = this_rq();
+	int this_cpu = rq->cpu;
+
+	if (test_tsk_need_resched(current))
+		return;
+
+	if (_qos_smt_check_need_resched(this_cpu, rq)) {
+		set_tsk_need_resched(current);
+		set_preempt_need_resched();
+	}
+}
+#endif
+
 #ifdef CONFIG_SMP
 static struct task_struct *pick_task_fair(struct rq *rq)
 {
@@ -8442,14 +8576,30 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	struct sched_entity *se;
 	struct task_struct *p;
 	int new_tasks;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	int this_cpu = rq->cpu;
+#endif
 again:
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	if (qos_smt_expelled(this_cpu)) {
+		__this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE);
+		return NULL;
+	}
+#endif
+
 	if (!sched_fair_runnable(rq))
 		goto idle;
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	if (!prev || prev->sched_class != &fair_sched_class)
-		goto simple;
+	if (!prev || prev->sched_class != &fair_sched_class) {
+#ifdef CONFIG_QOS_SCHED
+		if (cfs_rq->idle_h_nr_running != 0 && rq->online)
+			goto qos_simple;
+		else
+#endif
+			goto simple;
+	}
 	/*
 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
@@ -8533,6 +8683,34 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	}
 	goto done;
+
+#ifdef CONFIG_QOS_SCHED
+qos_simple:
+	if (prev)
+		put_prev_task(rq, prev);
+
+	do {
+		se = pick_next_entity(cfs_rq, NULL);
+		if (check_qos_cfs_rq(group_cfs_rq(se))) {
+			cfs_rq = &rq->cfs;
+			if (!cfs_rq->nr_running)
+				goto idle;
+			continue;
+		}
+
+		cfs_rq = group_cfs_rq(se);
+	} while (cfs_rq);
+
+	p = task_of(se);
+
+	while (se) {
+		set_next_entity(cfs_rq_of(se), se);
+		se = parent_entity(se);
+	}
+
+	goto done;
+#endif
+
 simple:
 #endif
 	if (prev)
@@ -8565,6 +8743,10 @@ done: __maybe_unused;
 	qos_schedule_throttle(p);
 #endif
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_smt_expel(this_cpu, p);
+#endif
+
 	return p;
 idle:
@@ -8598,6 +8780,10 @@ done: __maybe_unused;
 	 */
 	update_idle_rq_clock_pelt(rq);
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_smt_expel(this_cpu, NULL);
+#endif
+
 	return NULL;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7393c1a62513..701a165639e9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1406,6 +1406,11 @@ do { \
 } while (0)
 #ifdef CONFIG_QOS_SCHED
+enum task_qos_level {
+	QOS_LEVEL_OFFLINE = -1,
+	QOS_LEVEL_ONLINE = 0,
+	QOS_LEVEL_MAX
+};
 void init_qos_hrtimer(int cpu);
 #endif
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR
CVE: NA
--------------------------------
We have added two statistics for the qos smt expeller:
a) nr_qos_smt_send_ipi: the number of IPIs sent by online tasks to expel offline tasks;
b) nr_qos_smt_expelled: the number of times an offline task was not picked because of expelling.
Adapted to 6.4 kernel.
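Since the counters are printed by proc_sched_show_task() (see the debug.c hunk), they should appear in /proc/<pid>/sched when CONFIG_SCHEDSTATS is enabled. A small hypothetical userspace check, not part of the patch:

/* Hypothetical check: grep the two counters out of /proc/self/sched
 * (requires CONFIG_SCHEDSTATS and this patch applied). */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/sched", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "nr_qos_smt_send_ipi") ||
		    strstr(line, "nr_qos_smt_expelled"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}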
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 include/linux/sched.h | 6 ++++++
 kernel/sched/debug.c  | 6 ++++++
 kernel/sched/fair.c   | 2 ++
 3 files changed, 14 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ddf9d10bdc37..07e282cfa3e3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -549,6 +549,12 @@ struct sched_statistics {
 	u64			nr_wakeups_preferred_cpus;
 	u64			nr_wakeups_force_preferred_cpus;
 #endif
+
+#if defined(CONFIG_QOS_SCHED_SMT_EXPELLER) && !defined(__GENKSYMS__)
+	u64			nr_qos_smt_send_ipi;
+	u64			nr_qos_smt_expelled;
+#endif
+
 #endif /* CONFIG_SCHEDSTATS */
 } ____cacheline_aligned;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 940e191d7722..37e71fdd879d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1049,6 +1049,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	}
 #endif
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	P_SCHEDSTAT(nr_qos_smt_send_ipi);
+	P_SCHEDSTAT(nr_qos_smt_expelled);
+#endif
+
 	avg_atom = p->se.sum_exec_runtime;
 	if (nr_switches)
 		avg_atom = div64_ul(avg_atom, nr_switches);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e6dfa99cb864..04169bf307b6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8480,6 +8480,7 @@ static void qos_smt_send_ipi(int this_cpu)
 			rq->cfs.h_nr_running == 0)
 			continue;
+		schedstat_inc(current->stats.nr_qos_smt_send_ipi);
 		smp_send_reschedule(cpu);
 	}
 }
@@ -8584,6 +8585,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	if (qos_smt_expelled(this_cpu)) {
 		__this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE);
+		schedstat_inc(rq->curr->stats.nr_qos_smt_expelled);
 		return NULL;
 	}
 #endif
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR
CVE: NA
--------------------------------
We add tracepoints for two cases:
a) while an online task is running on the sibling cpu, the offline task currently running on the local cpu gets TIF_NEED_RESCHED set;
b) while an online task is running on the sibling cpu, the offline task that would be picked next on the local cpu is expelled.
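Both events are defined in the sched trace group, so once the patch is applied they can be enabled through tracefs. A minimal sketch, assuming the usual /sys/kernel/tracing mount point (older setups may use /sys/kernel/debug/tracing):

/* Sketch: enable the two new events through tracefs (mount point assumed). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int enable_event(const char *path)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	enable_event("/sys/kernel/tracing/events/sched/sched_qos_smt_expel/enable");
	enable_event("/sys/kernel/tracing/events/sched/sched_qos_smt_expelled/enable");
	return 0;
}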
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 include/trace/events/sched.h | 55 ++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |  9 ++++--
 2 files changed, 62 insertions(+), 2 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..4bafb70dfafc 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -268,6 +268,61 @@ TRACE_EVENT(sched_switch,
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+/*
+ * Tracepoint for a offline task being resched:
+ */
+TRACE_EVENT(sched_qos_smt_expel,
+
+	TP_PROTO(struct task_struct *sibling_p, int qos_smt_status),
+
+	TP_ARGS(sibling_p, qos_smt_status),
+
+	TP_STRUCT__entry(
+		__array(	char,	sibling_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	sibling_pid			)
+		__field(	int,	sibling_qos_status		)
+		__field(	int,	sibling_cpu			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->sibling_comm, sibling_p->comm, TASK_COMM_LEN);
+		__entry->sibling_pid = sibling_p->pid;
+		__entry->sibling_qos_status = qos_smt_status;
+		__entry->sibling_cpu = task_cpu(sibling_p);
+	),
+
+	TP_printk("sibling_comm=%s sibling_pid=%d sibling_qos_status=%d sibling_cpu=%d",
+		__entry->sibling_comm, __entry->sibling_pid, __entry->sibling_qos_status,
+		__entry->sibling_cpu)
+);
+
+/*
+ * Tracepoint for a offline task being expelled:
+ */
+TRACE_EVENT(sched_qos_smt_expelled,
+
+	TP_PROTO(struct task_struct *p, int qos_smt_status),
+
+	TP_ARGS(p, qos_smt_status),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	qos_status		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid = p->pid;
+		__entry->qos_status = qos_smt_status;
+	),
+
+	TP_printk("comm=%s pid=%d qos_status=%d",
+		__entry->comm, __entry->pid, __entry->qos_status)
+);
+#endif
+
 /*
  * Tracepoint for a task being migrated:
  */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04169bf307b6..2e5bbf3914c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8511,12 +8511,16 @@ static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq)
 		 * cpu is idle, and current cpu only has SCHED_IDLE tasks enqueued.
 		 */
 		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE &&
-			task_group(current)->qos_level < QOS_LEVEL_ONLINE)
+			task_group(current)->qos_level < QOS_LEVEL_ONLINE) {
+			trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu));
 			return true;
+		}
 		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE &&
-			rq->curr == rq->idle && sched_idle_cpu(this_cpu))
+			rq->curr == rq->idle && sched_idle_cpu(this_cpu)) {
+			trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu));
 			return true;
+		}
 	}
 	return false;
@@ -8586,6 +8590,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	if (qos_smt_expelled(this_cpu)) {
 		__this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE);
 		schedstat_inc(rq->curr->stats.nr_qos_smt_expelled);
+		trace_sched_qos_smt_expelled(rq->curr, per_cpu(qos_smt_status, this_cpu));
 		return NULL;
 	}
 #endif
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR
CVE: NA
--------------------------------
Enable CONFIG_QOS_SCHED_SMT_EXPELLER
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig | 1 +
 arch/x86/configs/openeuler_defconfig   | 1 +
 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 698df5e71350..c89f8a7ec669 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -161,6 +161,7 @@ CONFIG_BLK_CGROUP=y
 CONFIG_CGROUP_WRITEBACK=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_QOS_SCHED_SMT_EXPELLER=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_SCHED_MM_CID=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 295cd120ab64..d72097778f4b 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -184,6 +184,7 @@ CONFIG_BLK_CGROUP=y
 CONFIG_CGROUP_WRITEBACK=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_QOS_SCHED_SMT_EXPELLER=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_QOS_SCHED=y
 CONFIG_RT_GROUP_SCHED=y
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR
-------------------------------
Track how many tasks with the qos offline policy are present in each cfs_rq. This will be used by later commits.
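Condensed from the hunks below, for orientation only: the new counter mirrors idle_h_nr_running, a task counts as "qos idle" when its group is offline and it runs with SCHED_IDLE policy, and a cpu counts as "qos idle" when everything runnable on it is qos idle.

/* Condensed from the diff below, not a complete listing. */
static inline int task_has_qos_idle_policy(struct task_struct *p)
{
	return qos_idle_policy(task_group(p)->qos_level) && p->policy == SCHED_IDLE;
}

static bool qos_sched_idle_cpu(int this_cpu)
{
	struct rq *rq = cpu_rq(this_cpu);

	return unlikely(rq->nr_running == rq->cfs.qos_idle_h_nr_running &&
			rq->nr_running);
}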
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 kernel/sched/fair.c  | 82 +++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h | 22 ++++++++++++
 2 files changed, 99 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2e5bbf3914c2..6b5fa7b4ac89 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5432,6 +5432,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, idle_task_delta, dequeue = 1;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	long qos_idle_delta;
+#endif
 	raw_spin_lock(&cfs_b->lock);
 	/* This will start the period timer if necessary */
@@ -5463,6 +5466,10 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
+#endif
+
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 		/* throttled entity or throttle-on-deactivate */
@@ -5476,6 +5483,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
+#endif
 		if (qcfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
@@ -5498,6 +5508,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
+#endif
 	}
 	/* At this point se is NULL and we are at root level*/
@@ -5519,6 +5532,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	long qos_idle_delta;
+#endif
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -5558,6 +5574,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
+#endif
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -5570,6 +5589,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running += task_delta;
 		qcfs_rq->idle_h_nr_running += idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running += qos_idle_delta;
+#endif
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(qcfs_rq))
@@ -5587,6 +5609,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running += task_delta;
 		qcfs_rq->idle_h_nr_running += idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running += qos_idle_delta;
+#endif
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(qcfs_rq))
@@ -6337,6 +6362,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 	int idle_h_nr_running = task_has_idle_policy(p);
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	int qos_idle_h_nr_running = task_has_qos_idle_policy(p);
+#endif
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 	/*
@@ -6363,6 +6391,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running;
+#endif
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
@@ -6383,7 +6414,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
-
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running;
+#endif
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
@@ -6431,6 +6464,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_entity *se = &p->se;
 	int task_sleep = flags & DEQUEUE_SLEEP;
 	int idle_h_nr_running = task_has_idle_policy(p);
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	int qos_idle_h_nr_running = task_has_qos_idle_policy(p);
+#endif
 	bool was_sched_idle = sched_idle_rq(rq);
 	util_est_dequeue(&rq->cfs, p);
@@ -6441,6 +6477,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_running--;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running;
+#endif
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
@@ -6473,7 +6512,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_running--;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
-
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running;
+#endif
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
@@ -8155,7 +8196,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	struct rq *rq = rq_of(cfs_rq);
 	struct sched_entity *se;
 	long task_delta, idle_task_delta;
-
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	long qos_idle_delta;
+#endif
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 	/* freeze hierarchy runnable averages while throttled */
@@ -8165,6 +8208,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
+#endif
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 		/* throttled entity or throttle-on-deactivate */
@@ -8175,6 +8221,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
+#endif
 		if (qcfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
@@ -8197,6 +8246,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
+#endif
 	}
 	/* At this point se is NULL and we are at root level*/
@@ -8217,6 +8269,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	struct rq *rq = rq_of(cfs_rq);
 	struct sched_entity *se;
 	long task_delta, idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	long qos_idle_delta;
+#endif
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -8249,6 +8304,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
+#endif
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
@@ -8258,6 +8316,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		cfs_rq->h_nr_running += task_delta;
 		cfs_rq->idle_h_nr_running += idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
+#endif
 		if (cfs_rq_throttled(cfs_rq))
 			goto unthrottle_throttle;
@@ -8271,6 +8332,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 		cfs_rq->h_nr_running += task_delta;
 		cfs_rq->idle_h_nr_running += idle_task_delta;
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+		cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
+#endif
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
@@ -8429,6 +8493,14 @@ static bool qos_smt_check_siblings_status(int this_cpu)
 	return false;
 }
+static bool qos_sched_idle_cpu(int this_cpu)
+{
+	struct rq *rq = cpu_rq(this_cpu);
+
+	return unlikely(rq->nr_running == rq->cfs.qos_idle_h_nr_running &&
+			rq->nr_running);
+}
+
 static bool qos_smt_expelled(int this_cpu)
 {
 	/*
@@ -8436,7 +8508,7 @@ static bool qos_smt_expelled(int this_cpu)
 	 * offline tasks enqueued, there is not suitable task,
 	 * so pick_next_task_fair return null.
 	 */
-	if (qos_smt_check_siblings_status(this_cpu) && sched_idle_cpu(this_cpu))
+	if (qos_smt_check_siblings_status(this_cpu) && qos_sched_idle_cpu(this_cpu))
 		return true;
 	return false;
@@ -8517,7 +8589,7 @@ static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq)
 		}
 		if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE &&
-			rq->curr == rq->idle && sched_idle_cpu(this_cpu)) {
+			rq->curr == rq->idle && qos_sched_idle_cpu(this_cpu)) {
 			trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu));
 			return true;
 		}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 701a165639e9..c1f69a3ce166 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -658,6 +658,14 @@ struct cfs_rq {
 #if defined(CONFIG_QOS_SCHED)
 	struct list_head qos_throttled_list;
 #endif
+
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+	union {
+		unsigned int	qos_idle_h_nr_running; /* qos_level:-1 */
+		unsigned long	qos_idle_h_nr_running_padding;
+	};
+#endif
+
 };
 static inline int rt_bandwidth_enabled(void)
@@ -3250,6 +3258,20 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
 }
 #endif
+#ifdef CONFIG_QOS_SCHED
+static inline int qos_idle_policy(int policy)
+{
+	return policy == QOS_LEVEL_OFFLINE;
+}
+#endif
+
+#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+static inline int task_has_qos_idle_policy(struct task_struct *p)
+{
+	return qos_idle_policy(task_group(p)->qos_level) && p->policy == SCHED_IDLE;
+}
+#endif
+
 extern void swake_up_all_locked(struct swait_queue_head *q);
 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR
-------------------------------
Here is a typical case in which priority inversion can occasionally be caused by SMT expelling: assume there are two SMT siblings, cA and cB, with online tasks running on cA and offline tasks on cB. With SMT expelling, the online task drives off the offline tasks and occupies all SMT cores exclusively, which in turn starves the offline tasks and prevents them from releasing the resources that other, higher-priority tasks need.
Hence, this patch introduces another mechanism to alleviate this situation. A metric for the maximum expelling duration of offline tasks is set up, with a default value of 5 seconds. If an offline task has been expelled longer than that, all offline tasks are allowed to run into a small sleep (msleep) loop in the kernel before they go back to usermode; furthermore, if the two SMT siblings (such as cA and cB) are idle or have no online tasks to run, these offline tasks will keep running in usermode until the next schedule.
Adapted to the 6.4 kernel.
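Condensed from the fair.c hunks below, for orientation only (the 5-second default appears to come from the existing sysctl_overload_detect_period, 5000 ms): while offline tasks are being expelled the per-cpu overload timer is kept armed, and once it fires the cpu is marked overloaded so the expelled offline tasks are allowed to run again and release whatever resources they hold.

/* Condensed from the diff below, not a complete listing. */

/* In pick_next_task_fair(): keep expelling only while not overloaded,
 * and make sure the overload timer is running while we expel. */
	if (qos_smt_expelled(this_cpu) && !__this_cpu_read(qos_cpu_overload)) {
		__this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE);
		if (!qos_timer_is_activated(this_cpu))
			start_qos_hrtimer(this_cpu);
		return NULL;	/* only offline tasks here; keep them expelled */
	}

/* Timer handler: after the detection period, stop starving offline
 * tasks on this cpu and wake it up if it went idle. */
static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer)
{
	struct rq_flags rf;
	struct rq *rq = this_rq();

	rq_lock_irqsave(rq, &rf);
	__unthrottle_qos_cfs_rqs(smp_processor_id());
	__this_cpu_write(qos_cpu_overload, 1);
	if (rq->curr == rq->idle && rq->cfs.nr_running)
		resched_curr(rq);
	rq_unlock_irqrestore(rq, &rf);

	return HRTIMER_NORESTART;
}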
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 kernel/sched/fair.c | 46 +++++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b5fa7b4ac89..1e2e54eabe8f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -187,6 +187,7 @@ static DEFINE_PER_CPU(int, qos_cpu_overload);
 unsigned int sysctl_overload_detect_period = 5000;  /* in ms */
 unsigned int sysctl_offline_wait_interval = 100;  /* in ms */
 static int unthrottle_qos_cfs_rqs(int cpu);
+static bool qos_smt_expelled(int this_cpu);
 #endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
@@ -8183,6 +8184,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 }
 #ifdef CONFIG_QOS_SCHED
+static inline bool qos_timer_is_activated(int cpu)
+{
+	return hrtimer_active(per_cpu_ptr(&qos_overload_timer, cpu));
+}
+
+static inline void cancel_qos_timer(int cpu)
+{
+	hrtimer_cancel(per_cpu_ptr(&qos_overload_timer, cpu));
+}
 static inline bool is_offline_task(struct task_struct *p)
 {
@@ -8255,7 +8265,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	sub_nr_running(rq, task_delta);
 done:
-	if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq))))
+	if (!qos_timer_is_activated(cpu_of(rq)))
 		start_qos_hrtimer(cpu_of(rq));
 	cfs_rq->throttled = QOS_THROTTLED;
@@ -8346,10 +8356,6 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 unthrottle_throttle:
 	assert_list_leaf_cfs_rq(rq);
-
-	/* Determine whether we need to wake up potentially idle CPU: */
-	if (rq->curr == rq->idle && rq->cfs.nr_running)
-		resched_curr(rq);
 }
 static int __unthrottle_qos_cfs_rqs(int cpu)
@@ -8371,11 +8377,10 @@ static int __unthrottle_qos_cfs_rqs(int cpu)
 static int unthrottle_qos_cfs_rqs(int cpu)
 {
 	int res;
-
 	res = __unthrottle_qos_cfs_rqs(cpu);
-	if (res)
-		hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu)));
+	if (qos_timer_is_activated(cpu) && !qos_smt_expelled(cpu))
+		cancel_qos_timer(cpu);
 	return res;
 }
@@ -8431,8 +8436,13 @@ static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer)
 	struct rq *rq = this_rq();
 	rq_lock_irqsave(rq, &rf);
-	if (__unthrottle_qos_cfs_rqs(smp_processor_id()))
-		__this_cpu_write(qos_cpu_overload, 1);
+	__unthrottle_qos_cfs_rqs(smp_processor_id());
+	__this_cpu_write(qos_cpu_overload, 1);
+
+	/* Determine whether we need to wake up potentially idle CPU. */
+	if (rq->curr == rq->idle && rq->cfs.nr_running)
+		resched_curr(rq);
+
 	rq_unlock_irqrestore(rq, &rf);
 	return HRTIMER_NORESTART;
@@ -8472,6 +8482,13 @@ static void qos_schedule_throttle(struct task_struct *p)
 	}
 }
+#ifndef CONFIG_QOS_SCHED_SMT_EXPELLER
+static bool qos_smt_expelled(int this_cpu)
+{
+	return false;
+}
+#endif
+
 #endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
@@ -8659,8 +8676,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 again:
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
-	if (qos_smt_expelled(this_cpu)) {
+	if (qos_smt_expelled(this_cpu) && !__this_cpu_read(qos_cpu_overload)) {
 		__this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE);
+
+		if (!qos_timer_is_activated(this_cpu))
+			start_qos_hrtimer(this_cpu);
+
 		schedstat_inc(rq->curr->stats.nr_qos_smt_expelled);
 		trace_sched_qos_smt_expelled(rq->curr, per_cpu(qos_smt_status, this_cpu));
 		return NULL;
@@ -8851,7 +8872,8 @@ done: __maybe_unused;
 		goto again;
 	}
-	__this_cpu_write(qos_cpu_overload, 0);
+	if (!qos_smt_expelled(cpu_of(rq)))
+		__this_cpu_write(qos_cpu_overload, 0);
 #endif
 	/*
 	 * rq is about to be idle, check if we need to update the
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR
-------------------------------
Add the nosmtexpell cmdline parameter so that qos smt expelling can be disabled when we want to turn it off (e.g. by appending nosmtexpell to the kernel boot command line).
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
---
 kernel/sched/fair.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1e2e54eabe8f..54b60e21fa43 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8492,6 +8492,15 @@ static bool qos_smt_expelled(int this_cpu)
 #endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+DEFINE_STATIC_KEY_TRUE(qos_smt_expell_switch);
+
+static int __init qos_sched_smt_noexpell_setup(char *__unused)
+{
+	static_branch_disable(&qos_smt_expell_switch);
+	return 1;
+}
+__setup("nosmtexpell", qos_sched_smt_noexpell_setup);
+
 static bool qos_smt_check_siblings_status(int this_cpu)
 {
 	int cpu;
@@ -8520,6 +8529,9 @@ static bool qos_sched_idle_cpu(int this_cpu)
 static bool qos_smt_expelled(int this_cpu)
 {
+	if (!static_branch_likely(&qos_smt_expell_switch))
+		return false;
+
 	/*
 	 * The qos_smt_status of siblings cpu is online, and current cpu only has
 	 * offline tasks enqueued, there is not suitable task,
@@ -8576,15 +8588,29 @@ static void qos_smt_send_ipi(int this_cpu)
 static void qos_smt_expel(int this_cpu, struct task_struct *p)
 {
+	if (!static_branch_likely(&qos_smt_expell_switch))
+		return;
+
 	if (qos_smt_update_status(p))
 		qos_smt_send_ipi(this_cpu);
 }
+static inline bool qos_smt_enabled(void)
+{
+	if (!static_branch_likely(&qos_smt_expell_switch))
+		return false;
+
+	if (!sched_smt_active())
+		return false;
+
+	return true;
+}
+
 static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq)
 {
 	int cpu;
-	if (!sched_smt_active())
+	if (!qos_smt_enabled())
 		return false;
for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/2037
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/Q...