Zhang Qiao (5):
  sched: Introduce qos scheduler for co-location
  sched: Throttle qos cfs_rq when current cpu is running online task
  sched: Support kill boost for offline task
  sched: Introduce handle priority reversion mechanism
  sched: Enable qos scheduler config
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 include/linux/cgroup.h                 |   4 +
 include/linux/resume_user_mode.h       |   5 +
 include/linux/sched.h                  |  11 +
 init/Kconfig                           |  12 +
 kernel/cgroup/cgroup.c                 |  22 ++
 kernel/sched/core.c                    | 155 ++++++++++
 kernel/sched/fair.c                    | 378 ++++++++++++++++++++++++-
 kernel/sched/sched.h                   |  12 +
 kernel/signal.c                        |   3 +
 kernel/sysctl.c                        |   1 -
 12 files changed, 603 insertions(+), 2 deletions(-)
From: Zhang Qiao <zhangqiao22@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8MF4R
CVE: NA
--------------------------------
In cloud-native hybrid deployment scenarios, online tasks must preempt offline tasks in a timely manner, and offline tasks must not affect the QoS of online tasks. We therefore introduce the idea of a qos level into the scheduler, currently implemented by mapping qos levels to different scheduling policies. The qos scheduler changes the policy of the affected tasks when the qos level of a task group is modified through the cpu.qos_level cpu cgroup file. In this way we can satisfy the different needs of tasks at different qos levels.
The value of qos_level can be 0 or -1; the default is 0. If qos_level is 0, the group is an online group; otherwise it is an offline group.
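For illustration only, here is a minimal user-space sketch of marking a group offline through this interface. The cgroup mount point and the group name are assumptions for the example, not part of this patch:

/*
 * Mark an existing cpu cgroup as offline by writing -1 to cpu.qos_level.
 * Assumes the cgroup v1 cpu controller is mounted at /sys/fs/cgroup/cpu
 * and that a group named "offline_jobs" already exists there.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/cpu/offline_jobs/cpu.qos_level";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open cpu.qos_level");
		return 1;
	}
	/* "-1" marks the group offline; "0" (the default) keeps it online. */
	if (write(fd, "-1", 2) != 2)
		perror("write cpu.qos_level");
	close(fd);
	return 0;
}

Note that cpu_qos_write() in the diff below rejects switching a group back from -1 to 0, so marking a group offline is a one-way operation.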
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---
 init/Kconfig         |  12 +++++
 kernel/sched/core.c  | 120 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |   4 ++
 3 files changed, 136 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig index 6d35728b94b2..be94ba25ecca 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -985,6 +985,18 @@ menuconfig CGROUP_SCHED tasks.
if CGROUP_SCHED +config QOS_SCHED + bool "Qos task scheduling" + depends on CGROUP_SCHED + depends on CFS_BANDWIDTH + default n + help + This option enable qos scheduler, and support co-location online + services (Latency Sensitive) and offline tasks. colocation can + effectively improve the resource utilization. + + If in doubt, say N. + config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a854b71836dd..0888b580a010 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7727,6 +7727,18 @@ static int __sched_setscheduler(struct task_struct *p, } change:
+#ifdef CONFIG_QOS_SCHED + /* + * If the scheduling policy of an offline task is set to a policy + * other than SCHED_IDLE, the online task preemption and cpu resource + * isolation will be invalid, so return -EINVAL in this case. + */ + if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) { + task_rq_unlock(rq, p, &rf); + return -EINVAL; + } +#endif + if (user) { #ifdef CONFIG_RT_GROUP_SCHED /* @@ -10345,6 +10357,35 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock);
+#ifdef CONFIG_QOS_SCHED +static inline int alloc_qos_sched_group(struct task_group *tg, + struct task_group *parent) +{ + tg->qos_level = parent->qos_level; + + return 1; +} + +static void sched_change_qos_group(struct task_struct *tsk, struct task_group *tg) +{ + struct sched_attr attr = {0}; + + /* + * No need to re-setcheduler when a task is exiting or the task + * is in an autogroup. + */ + if (!(tsk->flags & PF_EXITING) && + !task_group_is_autogroup(tg) && + (tg->qos_level == -1)) { + attr.sched_priority = 0; + attr.sched_policy = SCHED_IDLE; + attr.sched_nice = PRIO_TO_NICE(tsk->static_prio); + __setscheduler_params(tsk, &attr); + __setscheduler_prio(tsk, normal_prio(tsk)); + } +} +#endif + static inline void alloc_uclamp_sched_group(struct task_group *tg, struct task_group *parent) { @@ -10395,6 +10436,11 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_fair_sched_group(tg, parent)) goto err;
+#ifdef CONFIG_QOS_SCHED + if (!alloc_qos_sched_group(tg, parent)) + goto err; +#endif + if (!alloc_rt_sched_group(tg, parent)) goto err;
@@ -10481,6 +10527,10 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group { tsk->sched_task_group = group;
+#ifdef CONFIG_QOS_SCHED + sched_change_qos_group(tsk, group); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk); @@ -11209,6 +11259,69 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, } #endif
+#ifdef CONFIG_QOS_SCHED +static int tg_change_scheduler(struct task_group *tg, void *data) +{ + int policy; + struct css_task_iter it; + struct sched_param param; + struct task_struct *tsk; + s64 qos_level = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->qos_level = qos_level; + if (qos_level == -1) + policy = SCHED_IDLE; + else + policy = SCHED_NORMAL; + + param.sched_priority = 0; + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_setscheduler(tsk, policy, ¶m); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_qos_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 qos_level) +{ + struct task_group *tg = css_tg(css); + + if (!tg->se[0]) + return -EINVAL; + + if (qos_level != -1 && qos_level != 0) + return -EINVAL; + + if (tg->qos_level == qos_level) + goto done; + + if (tg->qos_level == -1 && qos_level == 0) + return -EINVAL; + + cpus_read_lock(); + if (qos_level == -1) + cfs_bandwidth_usage_inc(); + else + cfs_bandwidth_usage_dec(); + cpus_read_unlock(); + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level)); + rcu_read_unlock(); +done: + return 0; +} + +static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->qos_level; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -11272,6 +11385,13 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, +#endif +#ifdef CONFIG_QOS_SCHED + { + .name = "qos_level", + .read_s64 = cpu_qos_read, + .write_s64 = cpu_qos_write, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 04846272409c..3586830dbeeb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -403,6 +403,10 @@ struct task_group {
struct cfs_bandwidth cfs_bandwidth;
+#ifdef CONFIG_QOS_SCHED + long qos_level; +#endif + #ifdef CONFIG_UCLAMP_TASK_GROUP /* The two decimal precision [%] value requested from user-space */ unsigned int uclamp_pct[UCLAMP_CNT];
On 2023/12/7 17:02, Wenyu Huang wrote:
> @@ -7727,6 +7727,18 @@ static int __sched_setscheduler(struct task_struct *p,
>  	}
>  change:
>
> +#ifdef CONFIG_QOS_SCHED
> +	/*
> +	 * If the scheduling policy of an offline task is set to a policy
> +	 * other than SCHED_IDLE, the online task preemption and cpu resource
> +	 * isolation will be invalid, so return -EINVAL in this case.
> +	 */
> +	if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) {
> +		task_rq_unlock(rq, p, &rf);
> +		return -EINVAL;

Change this here to:

	retval = -EINVAL;
	goto unlock;

> +	}
> +#endif
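Applied to the quoted hunk, the suggested change would look roughly as follows (a sketch that assumes __sched_setscheduler() still has its usual unlock label, which drops the rq lock and returns retval):

#ifdef CONFIG_QOS_SCHED
	/*
	 * If the scheduling policy of an offline task is set to a policy
	 * other than SCHED_IDLE, online task preemption and cpu resource
	 * isolation would no longer work, so reject the request.
	 */
	if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) {
		retval = -EINVAL;
		goto unlock;
	}
#endif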
From: Zhang Qiao <zhangqiao22@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8MF4R
CVE: NA
--------------------------------
In a co-location scenario, we usually deploy online and offline task groups on the same server.
Online tasks are more important than offline tasks. To prevent offline tasks from affecting the QoS of online tasks, we throttle the offline task groups when online task groups are running on the same cpu, and unthrottle the offline task groups when the cpu is about to enter the idle state.
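Because the flattened diff below is hard to read, here is a condensed sketch of the core check this patch adds on the pick_next_task_fair() path. The names are taken from the diff; locking, PELT and nr_running bookkeeping are omitted, so treat it as an outline rather than the exact implementation:

/*
 * If the cfs_rq just picked belongs to an offline group (qos_level < 0)
 * and this cpu still has non-idle work, throttle the offline cfs_rq
 * instead of running it. The cfs_rq is marked QOS_THROTTLED to
 * distinguish qos throttling from ordinary CFS bandwidth throttling and
 * is put on a per-cpu list so it can be unthrottled later.
 */
static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
		     !sched_idle_cpu(smp_processor_id()) &&
		     cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
		if (!rq_of(cfs_rq)->online)
			return false;

		throttle_qos_cfs_rq(cfs_rq);
		return true;
	}

	return false;
}

The matching unthrottle_qos_cfs_rqs() is called from the idle path of pick_next_task_fair(), so offline groups get to run again as soon as no online task is runnable on the cpu.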
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---
 kernel/sched/fair.c  | 259 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |   4 +
 2 files changed, 262 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2430c88e0428..b42262c3dd0d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -124,6 +124,18 @@ int __weak arch_asym_cpu_priority(int cpu) #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) #endif
+#ifdef CONFIG_QOS_SCHED + +/* + * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled + * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1). + */ +#define QOS_THROTTLED 2 + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +static int unthrottle_qos_cfs_rqs(int cpu); +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -5639,6 +5651,14 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq)];
+#ifdef CONFIG_QOS_SCHED + /* + * if this cfs_rq throttled by qos, not need unthrottle it. + */ + if (cfs_rq->throttled == QOS_THROTTLED) + return; +#endif + cfs_rq->throttled = 0;
update_rq_clock(rq); @@ -5823,7 +5843,20 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) goto next; #endif
- /* By the above checks, this should never be true */ + /* + * CPU hotplug callbacks race against distribute_cfs_runtime() + * when the QOS_SCHED feature is enabled, there may be + * situations where the runtime_remaining > 0. + * Qos_sched does not care whether the cfs_rq has time left, + * so no longer allocate time to cfs_rq in this scenario. + */ +#ifdef CONFIG_QOS_SCHED + if (cfs_rq->throttled == QOS_THROTTLED && + cfs_rq->runtime_remaining > 0) + goto next; +#endif + + /* By the above check, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock); @@ -6191,6 +6224,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); #endif +#ifdef CONFIG_QOS_SCHED + INIT_LIST_HEAD(&cfs_rq->qos_throttled_list); +#endif }
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -6280,6 +6316,9 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * the rq clock again in unthrottle_cfs_rq(). */ rq_clock_start_loop_update(rq); +#ifdef CONFIG_QOS_SCHED + unthrottle_qos_cfs_rqs(cpu_of(rq)); +#endif
rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -6305,6 +6344,9 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) rcu_read_unlock();
rq_clock_stop_loop_update(rq); +#ifdef CONFIG_QOS_SCHED + unthrottle_qos_cfs_rqs(cpu_of(rq)); +#endif }
bool cfs_task_bw_constrained(struct task_struct *p) @@ -8115,6 +8157,196 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ resched_curr(rq); }
+#ifdef CONFIG_QOS_SCHED +static inline bool is_offline_task(struct task_struct *p) +{ + return task_group(p)->qos_level == -1; +} + +static void start_qos_hrtimer(int cpu); + +static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *se; + long task_delta, idle_task_delta; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + /* freeze hierarchy runnable averages while throttled */ + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); + + task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; + } + } + + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + } + + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); + +done: + if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) + start_qos_hrtimer(cpu_of(rq)); + + cfs_rq->throttled = QOS_THROTTLED; + + list_add(&cfs_rq->qos_throttled_list, + &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); +} + +static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *se; + long task_delta, idle_task_delta; + + se = cfs_rq->tg->se[cpu_of(rq)]; + + if (cfs_rq->throttled != QOS_THROTTLED) + return; + + cfs_rq->throttled = 0; + + update_rq_clock(rq); + list_del_init(&cfs_rq->qos_throttled_list); + + /* update hierarchical throttle state */ + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + rcu_read_unlock(); + + if (!cfs_rq->load.weight) { + if (!cfs_rq->on_list) + return; + /* + * Nothing to run but something to decay (on_list)? + * Complete the branch. 
+ */ + for_each_sched_entity(se) { + if (list_add_leaf_cfs_rq(cfs_rq_of(se))) + break; + } + goto unthrottle_throttle; + } + + task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + break; + + cfs_rq = cfs_rq_of(se); + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + } + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + } + + add_nr_running(rq, task_delta); + +unthrottle_throttle: + + assert_list_leaf_cfs_rq(rq); + + /* Determine whether we need to wake up potentially idle CPU: */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_curr(rq); +} + +static int unthrottle_qos_cfs_rqs(int cpu) +{ + struct cfs_rq *cfs_rq, *tmp_rq; + int res = 0; + + list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), + qos_throttled_list) { + if (cfs_rq_throttled(cfs_rq)) { + unthrottle_qos_cfs_rq(cfs_rq); + res++; + } + } + + return res; +} + +static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && + !sched_idle_cpu(smp_processor_id()) && + cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { + + if (!rq_of(cfs_rq)->online) + return false; + + throttle_qos_cfs_rq(cfs_rq); + return true; + } + + return false; +} + +static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (cfs_rq->tg->qos_level == -1 && cfs_rq_throttled(cfs_rq)) + unthrottle_qos_cfs_rq(cfs_rq); + rq_unlock_irqrestore(rq, &rf); +} +#endif + #ifdef CONFIG_SMP static struct task_struct *pick_task_fair(struct rq *rq) { @@ -8205,6 +8437,16 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
se = pick_next_entity(cfs_rq, curr); cfs_rq = group_cfs_rq(se); +#ifdef CONFIG_QOS_SCHED + if (check_qos_cfs_rq(cfs_rq)) { + cfs_rq = &rq->cfs; + WARN(cfs_rq->nr_running == 0, + "rq->nr_running=%u, cfs_rq->idle_h_nr_running=%u\n", + rq->nr_running, cfs_rq->idle_h_nr_running); + if (unlikely(!cfs_rq->nr_running)) + return NULL; + } +#endif } while (cfs_rq);
p = task_of(se); @@ -8284,6 +8526,12 @@ done: __maybe_unused; if (new_tasks > 0) goto again;
+#ifdef CONFIG_QOS_SCHED + if (unthrottle_qos_cfs_rqs(cpu_of(rq))) { + rq->idle_stamp = 0; + goto again; + } +#endif /* * rq is about to be idle, check if we need to update the * lost_idle_time of clock_pelt @@ -12600,6 +12848,10 @@ void free_fair_sched_group(struct task_group *tg) int i;
for_each_possible_cpu(i) { +#ifdef CONFIG_QOS_SCHED + if (tg->cfs_rq && tg->cfs_rq[i]) + unthrottle_qos_sched_group(tg->cfs_rq[i]); +#endif if (tg->cfs_rq) kfree(tg->cfs_rq[i]); if (tg->se) @@ -12989,6 +13241,11 @@ __init void init_sched_fair_class(void) #endif }
+#ifdef CONFIG_QOS_SCHED + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); +#endif + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3586830dbeeb..3d5ce9a01068 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -653,6 +653,10 @@ struct cfs_rq { #endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ + +#if defined(CONFIG_QOS_SCHED) + struct list_head qos_throttled_list; +#endif };
static inline int rt_bandwidth_enabled(void)
From: Zhang Qiao <zhangqiao22@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8MF4R
CVE: NA
--------------------------------
If online tasks occupy 100% of the CPU, offline tasks cannot be scheduled because they are throttled; as a result, an offline task cannot respond in time after receiving a SIGKILL signal.
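Condensed from the diff below, the fix works roughly like this: when SIGKILL is delivered, each offline thread in the thread group is handed to a high-priority workqueue, which moves it to the root cpu cgroup and switches it back to SCHED_NORMAL so it is no longer subject to qos throttling. The sketch keeps the reference counting but omits locking and allocation-failure handling:

/* Condensed sketch of the flow added below, not a drop-in replacement. */
static void sched_move_work(struct work_struct *work)
{
	struct sched_param param = { .sched_priority = 0 };
	struct offline_args *args = container_of(work, struct offline_args, work);

	cgroup_move_task_to_root(args->p);                 /* leave the offline group */
	sched_setscheduler(args->p, SCHED_NORMAL, &param); /* drop SCHED_IDLE */
	put_task_struct(args->p);
	kfree(args);
}

/*
 * complete_signal() calls sched_move_offline_task(t) for each thread when
 * a fatal SIGKILL is delivered; for offline tasks (qos_level == -1) it
 * takes a task reference and queues sched_move_work() on system_highpri_wq,
 * because the cgroup migration cannot be done from signal-delivery context.
 */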
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---
 include/linux/cgroup.h |  4 ++++
 include/linux/sched.h  |  4 ++++
 kernel/cgroup/cgroup.c | 22 ++++++++++++++++++++++
 kernel/sched/core.c    | 32 ++++++++++++++++++++++++++++++++
 kernel/signal.c        |  3 +++
 5 files changed, 65 insertions(+)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b307013b9c6c..7fa51b600ee8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -855,4 +855,8 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
#endif /* CONFIG_CGROUP_BPF */
+#ifdef CONFIG_QOS_SCHED +void cgroup_move_task_to_root(struct task_struct *tsk); +#endif + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 77f01ac385f7..d796894806f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2457,4 +2457,8 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+#ifdef CONFIG_QOS_SCHED +void sched_move_offline_task(struct task_struct *p); +#endif + #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 518725b57200..487330fedd93 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2936,6 +2936,28 @@ void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked ss->post_attach(); }
+#ifdef CONFIG_QOS_SCHED +void cgroup_move_task_to_root(struct task_struct *tsk) +{ + struct css_set *css; + struct cgroup *cpu_cgrp; + struct cgroup *cpu_root_cgrp; + + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + + spin_lock_irq(&css_set_lock); + css = task_css_set(tsk); + cpu_cgrp = css->subsys[cpu_cgrp_id]->cgroup; + cpu_root_cgrp = &cpu_cgrp->root->cgrp; + spin_unlock_irq(&css_set_lock); + + (void)cgroup_attach_task(cpu_root_cgrp, tsk, false); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); +} +#endif + static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0888b580a010..ac3de8f7e27d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10384,6 +10384,38 @@ static void sched_change_qos_group(struct task_struct *tsk, struct task_group *t __setscheduler_prio(tsk, normal_prio(tsk)); } } + +struct offline_args { + struct work_struct work; + struct task_struct *p; +}; + +static void sched_move_work(struct work_struct *work) +{ + struct sched_param param = { .sched_priority = 0 }; + struct offline_args *args = container_of(work, struct offline_args, work); + + cgroup_move_task_to_root(args->p); + sched_setscheduler(args->p, SCHED_NORMAL, ¶m); + put_task_struct(args->p); + kfree(args); +} + +void sched_move_offline_task(struct task_struct *p) +{ + struct offline_args *args; + + if (unlikely(task_group(p)->qos_level != -1)) + return; + + args = kmalloc(sizeof(struct offline_args), GFP_ATOMIC); + if (args) { + get_task_struct(p); + args->p = p; + INIT_WORK(&args->work, sched_move_work); + queue_work(system_highpri_wq, &args->work); + } +} #endif
static inline void alloc_uclamp_sched_group(struct task_group *tg, diff --git a/kernel/signal.c b/kernel/signal.c index 09019017d669..28cddef39778 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1060,6 +1060,9 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) signal->group_stop_count = 0; t = p; do { +#ifdef CONFIG_QOS_SCHED + sched_move_offline_task(t); +#endif task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1);
From: Zhang Qiao <zhangqiao22@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8MF4R
CVE: NA
--------------------------------
When online tasks occupy the cpu for a long time, offline tasks may never get the cpu to run on, which can trigger a priority inversion issue. When that happens, we unthrottle the offline tasks so they get a chance to run. Concretely, once online tasks have occupied the cpu for more than 5s (the default value), we unthrottle the offline tasks; before returning to user mode they enter a sleep loop until the cpu goes idle.
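Both thresholds are tunable through the sysctls added below. Here is a minimal user-space sketch of adjusting them; the /proc/sys/kernel/ paths assume these entries end up in the "kernel" sysctl table together with the other sched_fair sysctls, so verify them on a real system:

/*
 * Sketch: shorten the overload-detection window to 2s and the offline wait
 * interval to 200ms. The patch clamps the values to [100, 100000] ms and
 * [100, 1000] ms respectively.
 */
#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	write_sysctl("/proc/sys/kernel/qos_overload_detect_period_ms", "2000");
	write_sysctl("/proc/sys/kernel/qos_offline_wait_interval_ms", "200");
	return 0;
}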
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---
 include/linux/resume_user_mode.h |   5 ++
 include/linux/sched.h            |   7 ++
 kernel/sched/core.c              |   3 +
 kernel/sched/fair.c              | 121 ++++++++++++++++++++++++++++++-
 kernel/sched/sched.h             |   4 +
 kernel/sysctl.c                  |   1 -
 6 files changed, 139 insertions(+), 2 deletions(-)
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index f8f3e958e9cf..255372856812 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,6 +59,11 @@ static inline void resume_user_mode_work(struct pt_regs *regs) blkcg_maybe_throttle_current();
rseq_handle_notify_resume(NULL, regs); + +#ifdef CONFIG_QOS_SCHED + sched_qos_offline_wait(); +#endif + }
#endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d796894806f4..ab69ae3808bb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2459,6 +2459,13 @@ extern void sched_set_stop_task(int cpu, struct task_struct *stop);
#ifdef CONFIG_QOS_SCHED void sched_move_offline_task(struct task_struct *p); +void sched_qos_offline_wait(void); +int sched_qos_cpu_overload(void); +#else +static inline int sched_qos_cpu_overload(void) +{ + return 0; +} #endif
#endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ac3de8f7e27d..bff68b26d2e5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10018,6 +10018,9 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ +#ifdef CONFIG_QOS_SCHED + init_qos_hrtimer(i); +#endif init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b42262c3dd0d..7c2d0104298b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -57,6 +57,11 @@ #include "stats.h" #include "autogroup.h"
+#ifdef CONFIG_QOS_SCHED +#include <linux/delay.h> +#include <linux/resume_user_mode.h> +#endif + /* * The initial- and re-scaling of tunables is configurable * @@ -133,6 +138,12 @@ int __weak arch_asym_cpu_priority(int cpu) #define QOS_THROTTLED 2
static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer); +static DEFINE_PER_CPU(int, qos_cpu_overload); +unsigned int sysctl_overload_detect_period = 5000; /* in ms */ +unsigned int sysctl_offline_wait_interval = 100; /* in ms */ +static int one_thousand = 1000; +static int hundred_thousand = 100000; static int unthrottle_qos_cfs_rqs(int cpu); #endif
@@ -184,6 +195,26 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ZERO, }, #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_QOS_SCHED + { + .procname = "qos_overload_detect_period_ms", + .data = &sysctl_overload_detect_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_HUNDRED, + .extra2 = &hundred_thousand, + }, + { + .procname = "qos_offline_wait_interval_ms", + .data = &sysctl_offline_wait_interval, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_HUNDRED, + .extra2 = &one_thousand, + }, +#endif {} };
@@ -8303,7 +8334,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); }
-static int unthrottle_qos_cfs_rqs(int cpu) +static int __unthrottle_qos_cfs_rqs(int cpu) { struct cfs_rq *cfs_rq, *tmp_rq; int res = 0; @@ -8319,8 +8350,22 @@ static int unthrottle_qos_cfs_rqs(int cpu) return res; }
+static int unthrottle_qos_cfs_rqs(int cpu) +{ + int res; + + res = __unthrottle_qos_cfs_rqs(cpu); + if (res) + hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu))); + + return res; +} + static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) { + if (unlikely(__this_cpu_read(qos_cpu_overload))) + return false; + if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && !sched_idle_cpu(smp_processor_id()) && cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { @@ -8345,6 +8390,74 @@ static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq) unthrottle_qos_cfs_rq(cfs_rq); rq_unlock_irqrestore(rq, &rf); } + +void sched_qos_offline_wait(void) +{ + long qos_level; + + while (unlikely(this_cpu_read(qos_cpu_overload))) { + rcu_read_lock(); + qos_level = task_group(current)->qos_level; + rcu_read_unlock(); + if (qos_level != -1 || fatal_signal_pending(current)) + break; + + schedule_timeout_killable(msecs_to_jiffies(sysctl_offline_wait_interval)); + } +} + +int sched_qos_cpu_overload(void) +{ + return __this_cpu_read(qos_cpu_overload); +} + +static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer) +{ + struct rq_flags rf; + struct rq *rq = this_rq(); + + rq_lock_irqsave(rq, &rf); + if (__unthrottle_qos_cfs_rqs(smp_processor_id())) + __this_cpu_write(qos_cpu_overload, 1); + rq_unlock_irqrestore(rq, &rf); + + return HRTIMER_NORESTART; +} + +static void start_qos_hrtimer(int cpu) +{ + ktime_t time; + struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu)); + + time = ktime_add_ms(hrtimer->base->get_time(), (u64)sysctl_overload_detect_period); + hrtimer_set_expires(hrtimer, time); + hrtimer_start_expires(hrtimer, HRTIMER_MODE_ABS_PINNED); +} + +void init_qos_hrtimer(int cpu) +{ + struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu)); + + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer->function = qos_overload_timer_handler; +} + +/* + * To avoid Priority inversion issues, when this cpu is qos_cpu_overload, + * we should schedule offline tasks to run so that they can leave kernel + * critical sections, and throttle them before returning to user mode. + */ +static void qos_schedule_throttle(struct task_struct *p) +{ + if (unlikely(current->flags & PF_KTHREAD)) + return; + + if (unlikely(this_cpu_read(qos_cpu_overload))) { + if (is_offline_task(p)) + set_notify_resume(p); + } +} + #endif
#ifdef CONFIG_SMP @@ -8507,6 +8620,10 @@ done: __maybe_unused; update_misfit_status(p, rq); sched_fair_update_stop_tick(rq, p);
+#ifdef CONFIG_QOS_SCHED + qos_schedule_throttle(p); +#endif + return p;
idle: @@ -8531,6 +8648,8 @@ done: __maybe_unused; rq->idle_stamp = 0; goto again; } + + __this_cpu_write(qos_cpu_overload, 0); #endif /* * rq is about to be idle, check if we need to update the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3d5ce9a01068..3de84e95baf1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1411,6 +1411,10 @@ do { \ flags = _raw_spin_rq_lock_irqsave(rq); \ } while (0)
+#ifdef CONFIG_QOS_SCHED +void init_qos_hrtimer(int cpu); +#endif + #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 354a2d294f52..21a0047978fb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,7 +96,6 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); static const int six_hundred_forty_kb = 640 * 1024; #endif
- static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP;
From: Zhang Qiao <zhangqiao22@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8MF4R
CVE: NA
--------------------------------
Enable CONFIG_QOS_SCHED in the openeuler_defconfig files to support the qos scheduler.
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig | 1 +
 arch/x86/configs/openeuler_defconfig   | 1 +
 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 8f1a4db8d49b..9409921527bd 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -176,6 +176,7 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y # CONFIG_CGROUP_MISC is not set +CONFIG_QOS_SCHED=y # CONFIG_CGROUP_DEBUG is not set CONFIG_SOCK_CGROUP_DATA=y CONFIG_NAMESPACES=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 507d199ff598..6b6cde2bbc02 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -185,6 +185,7 @@ CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y +CONFIG_QOS_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y CONFIG_CGROUP_PIDS=y
Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/3270 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/4...