[PATCH openEuler-23.09 v1 07/15] sched: Introduce handle priority reversion mechanism

29 Aug 2023

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX
CVE: NA
--------------------------------
When online tasks occupy a CPU for a long time, offline tasks get no CPU
time to run, and a priority inversion may be triggered in this case.
If that happens, we unthrottle the offline tasks so that they get a
chance to run.
When online tasks have occupied a CPU for more than 5s (the default value),
we unthrottle the offline tasks, and an offline task then enters an msleep
loop before exiting to user mode until the CPU goes idle.
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---
 include/linux/sched.h        |  7 +++
 include/linux/sched/sysctl.h |  5 +++
 kernel/entry/common.c        |  7 ++-
 kernel/sched/core.c          |  3 ++
 kernel/sched/fair.c          | 84 ++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h         |  4 ++
 kernel/sysctl.c              | 24 +++++++++++
 7 files changed, 130 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b64333aa7af..e92c57841135 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2468,6 +2468,13 @@ static inline bool dynamic_affinity_enabled(void)
 #endif
 #ifdef CONFIG_QOS_SCHED
 void sched_move_offline_task(struct task_struct *p);
+void sched_qos_offline_wait(void);
+int sched_qos_cpu_overload(void);
+#else
+/* Stub used when CONFIG_QOS_SCHED is not set: always reports 0. */
+static inline int sched_qos_cpu_overload(void)
+{
+	return 0;
+}
 #endif
#endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index ede157a678f8..28d9be8e4614 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -32,4 +32,9 @@ extern int sysctl_numa_balancing_mode;
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 extern int sysctl_sched_util_low_pct;
 #endif
+#ifdef CONFIG_QOS_SCHED
+extern unsigned int sysctl_overload_detect_period;
+extern unsigned int sysctl_offline_wait_interval;
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index be61332c66b5..e3df7fdfd901 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -170,6 +170,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
    	if (ti_work & _TIF_NOTIFY_RESUME)
    		resume_user_mode_work(regs);
+#ifdef CONFIG_QOS_SCHED
+		sched_qos_offline_wait();
+#endif
+
    	/* Architecture specific TIF work */
    	arch_exit_to_user_mode_work(regs, ti_work);
@@ -200,7 +204,8 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
    tick_nohz_user_enter_prepare();
ti_work = read_thread_flags();
-	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+	if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) ||
+		      sched_qos_cpu_overload()))
    	ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b82bddac1352..f72cd213a784 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9981,6 +9981,9 @@ void __init sched_init(void)
    	 * We achieve this by letting root_task_group's tasks sit
    	 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
    	 */
+#ifdef CONFIG_QOS_SCHED
+		init_qos_hrtimer(i);
+#endif
    	init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fee251393313..1fe68608877a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -56,6 +56,10 @@
 #include "stats.h"
 #include "autogroup.h"
+#ifdef CONFIG_QOS_SCHED
+#include <linux/delay.h>
+#endif
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -166,6 +170,10 @@ int __weak arch_asym_cpu_priority(int cpu)
#ifdef CONFIG_QOS_SCHED
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer);
+static DEFINE_PER_CPU(int, qos_cpu_overload);
+unsigned int sysctl_overload_detect_period = 5000;  /* in ms */
+unsigned int sysctl_offline_wait_interval = 100;  /* in ms */
 static int unthrottle_qos_cfs_rqs(int cpu);
 #endif
@@ -8107,6 +8115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 }
#ifdef CONFIG_QOS_SCHED
+static void start_qos_hrtimer(int cpu);
 static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
    struct rq *rq = rq_of(cfs_rq);
@@ -8140,6 +8149,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
    if (!se)
    	sub_nr_running(rq, task_delta);
+	if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq))))
+		start_qos_hrtimer(cpu_of(rq));
+
    cfs_rq->throttled = 1;
    cfs_rq->throttled_clock = rq_clock(rq);
@@ -8195,7 +8207,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
    	resched_curr(rq);
 }
-static int unthrottle_qos_cfs_rqs(int cpu)
+static int __unthrottle_qos_cfs_rqs(int cpu)
 {
    struct cfs_rq *cfs_rq, *tmp_rq;
    int res = 0;
@@ -8211,11 +8223,25 @@ static int unthrottle_qos_cfs_rqs(int cpu)
    return res;
 }
+/*
+ * Call __unthrottle_qos_cfs_rqs() for @cpu and, when it returns non-zero,
+ * cancel that CPU's qos_overload_timer.
+ *
+ * Returns the value returned by __unthrottle_qos_cfs_rqs().
+ *
+ * NOTE(review): hrtimer_cancel() waits for a running callback to finish, and
+ * qos_overload_timer_handler() takes the rq lock -- confirm that no caller
+ * can get here for a remote CPU while holding that CPU's rq lock.
+ */
+static int unthrottle_qos_cfs_rqs(int cpu)
+{
+	int res;
+
+	res = __unthrottle_qos_cfs_rqs(cpu);
+	if (res)
+		hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu)));
+
+	return res;
+}
+
 static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
+	if (unlikely(__this_cpu_read(qos_cpu_overload)))
+		return false;
+
    if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
-		     !sched_idle_cpu(smp_processor_id()) &&
-		     cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
+		!sched_idle_cpu(smp_processor_id()) &&
+		cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
    	throttle_qos_cfs_rq(cfs_rq);
    	return true;
    }
@@ -8233,6 +8259,56 @@ static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq)
    	unthrottle_qos_cfs_rq(cfs_rq);
    rq_unlock_irqrestore(rq, &rf);
 }
+
+/*
+ * Hold back the current task while this CPU's qos_cpu_overload flag is set.
+ *
+ * Each pass reads the qos_level of current's task group under the RCU read
+ * lock. The loop ends as soon as the flag is clear, the qos_level is not -1,
+ * or a signal is pending; otherwise the task sleeps interruptibly for
+ * sysctl_offline_wait_interval milliseconds and checks again.
+ *
+ * NOTE(review): check_qos_cfs_rq() tests "qos_level < 0" while this tests
+ * "qos_level != -1" -- confirm that -1 is the only negative level.
+ */
+void sched_qos_offline_wait(void)
+{
+	long qos_level;
+
+	while (unlikely(this_cpu_read(qos_cpu_overload))) {
+		rcu_read_lock();
+		qos_level = task_group(current)->qos_level;
+		rcu_read_unlock();
+		if (qos_level != -1 || signal_pending(current))
+			break;
+		msleep_interruptible(sysctl_offline_wait_interval);
+	}
+}
+
+/* Return the current value of this CPU's qos_cpu_overload flag. */
+int sched_qos_cpu_overload(void)
+{
+	return __this_cpu_read(qos_cpu_overload);
+}
+
+/*
+ * Callback of the per-CPU qos_overload_timer.
+ *
+ * With the local rq lock held, call __unthrottle_qos_cfs_rqs() for this CPU
+ * and, if it returns non-zero, set this CPU's qos_cpu_overload flag to 1.
+ *
+ * Always returns HRTIMER_NORESTART, so the timer does not re-arm itself.
+ */
+static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer)
+{
+	struct rq_flags rf;
+	struct rq *rq = this_rq();
+
+	rq_lock_irqsave(rq, &rf);
+	if (__unthrottle_qos_cfs_rqs(smp_processor_id()))
+		__this_cpu_write(qos_cpu_overload, 1);
+	rq_unlock_irqrestore(rq, &rf);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Arm @cpu's qos_overload_timer so that it expires
+ * sysctl_overload_detect_period milliseconds after the current time of the
+ * timer's clock base. The expiry is absolute and the timer is started in
+ * HRTIMER_MODE_ABS_PINNED mode.
+ */
+static void start_qos_hrtimer(int cpu)
+{
+	ktime_t time;
+	struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu));
+
+	time = ktime_add_ms(hrtimer->base->get_time(), (u64)sysctl_overload_detect_period);
+	hrtimer_set_expires(hrtimer, time);
+	hrtimer_start_expires(hrtimer, HRTIMER_MODE_ABS_PINNED);
+}
+
+/*
+ * Initialise @cpu's qos_overload_timer on CLOCK_MONOTONIC in
+ * HRTIMER_MODE_ABS_PINNED mode and install qos_overload_timer_handler() as
+ * its callback. The timer is not started here.
+ */
+void init_qos_hrtimer(int cpu)
+{
+	struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu));
+
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	hrtimer->function = qos_overload_timer_handler;
+}
 #endif
#ifdef CONFIG_SMP
@@ -8418,6 +8494,8 @@ done: __maybe_unused;
    	rq->idle_stamp = 0;
    	goto again;
    }
+
+	__this_cpu_write(qos_cpu_overload, 0);
 #endif
    /*
     * rq is about to be idle, check if we need to update the
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5a8afed85a10..feafe8661075 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1401,6 +1401,10 @@ do {						\
    flags = _raw_spin_rq_lock_irqsave(rq);	\
 } while (0)
+#ifdef CONFIG_QOS_SCHED
+void init_qos_hrtimer(int cpu);
+#endif
+
 #ifdef CONFIG_SCHED_SMT
 extern void __update_idle_core(struct rq *rq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index acc20b417dc8..e9af234bf882 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,6 +96,10 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals);
 static const int six_hundred_forty_kb = 640 * 1024;
 #endif
+#ifdef CONFIG_QOS_SCHED
+static int one_thousand = 1000;
+static int hundred_thousand = 100000;
+#endif
static const int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
@@ -2045,6 +2049,26 @@ static struct ctl_table kern_table[] = {
    	.extra1		= SYSCTL_ZERO,
    	.extra2		= SYSCTL_ONE,
    },
+#ifdef CONFIG_QOS_SCHED
+	{
+		.procname	= "qos_overload_detect_period_ms",
+		.data		= &sysctl_overload_detect_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE_HUNDRED,
+		.extra2		= &hundred_thousand,
+	},
+	{
+		.procname	= "qos_offline_wait_interval_ms",
+		.data		= &sysctl_offline_wait_interval,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE_HUNDRED,
+		.extra2		= &one_thousand,
+	},
+#endif
    {
    	.procname	= "max_rcu_stall_to_panic",
    	.data		= &sysctl_max_rcu_stall_to_panic,
-- 
2.25.1


    

2025

2024

2023

2022

2021

2020

2019

[PATCH openEuler-23.09 v1 07/15] sched: Introduce handle priority reversion mechanism