From: Hui Tang <tanghui20@huawei.com>
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7CGD0
CVE: NA
----------------------------------------
Deadlock occurs in two situations as follows:
The first case:
tg_set_dynamic_affinity_mode --- raw_spin_lock_irq(&auto_affi->lock);
  ->start_auto_affinity --- trigger timer
    ->tg_update_task_prefer_cpus
      ->css_task_iter_next
        ->raw_spin_unlock_irq

hrtimer_run_queues
  ->sched_auto_affi_period_timer --- try spin lock (&auto_affi->lock)
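That is, the cgroup write arms the period timer and then re-enables interrupts
(via the css task iterator) while auto_affi->lock is still held, so the timer
handler can fire on the same CPU and spin on that lock. A minimal sketch of the
pattern, with hypothetical helper names and simplified bodies (not the literal
code):

  /* Process context: cgroup write that enables auto mode. */
  static int set_mode_deadlock_prone(struct auto_affinity *auto_affi)
  {
          raw_spin_lock_irq(&auto_affi->lock);    /* IRQs off, lock held */
          /*
           * start_auto_affinity() arms period_timer and walks the css
           * task iterator; css_task_iter_next() ends up calling
           * raw_spin_unlock_irq(), re-enabling IRQs with the lock held.
           */
          start_auto_affinity(auto_affi);
          raw_spin_unlock_irq(&auto_affi->lock);
          return 0;
  }

  /* hrtimer handler; it can now interrupt the window above on this CPU. */
  static enum hrtimer_restart period_timer_fn(struct hrtimer *timer)
  {
          struct auto_affinity *auto_affi =
                  container_of(timer, struct auto_affinity, period_timer);
          unsigned long flags;

          /* Spins forever: the interrupted context still owns the lock. */
          raw_spin_lock_irqsave(&auto_affi->lock, flags);
          raw_spin_unlock_irqrestore(&auto_affi->lock, flags);
          return HRTIMER_NORESTART;
  }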
The second case is as follows:
[ 291.470810] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[ 291.472715] rcu: 1-...0: (0 ticks this GP) idle=a6a/1/0x4000000000000002 softirq=78516/78516 fqs=5249
[ 291.475268] rcu: (detected by 6, t=21006 jiffies, g=202169, q=9862)
[ 291.477038] Sending NMI from CPU 6 to CPUs 1:
[ 291.481268] NMI backtrace for cpu 1
[ 291.481273] CPU: 1 PID: 1923 Comm: sh Kdump: loaded Not tainted 4.19.90+ #150
[ 291.481278] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
[ 291.481281] RIP: 0010:queued_spin_lock_slowpath+0x136/0x9a0
[ 291.481289] Code: c0 74 3f 49 89 dd 48 89 dd 48 b8 00 00 00 00 00 fc ff df 49 c1 ed 03 83 e5 07 49 01 c5 83 c5 03 48 83 05 c4 66 b9 05 01 f3 90 <41> 0f b6 45 00 40 38 c5 7c 08 84 c0 0f 85 ad 07 00 00 0
[ 291.481292] RSP: 0018:ffff88801de87cd8 EFLAGS: 00000002
[ 291.481297] RAX: 0000000000000101 RBX: ffff888001be0a28 RCX: ffffffffb8090f7d
[ 291.481301] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff888001be0a28
[ 291.481304] RBP: 0000000000000003 R08: ffffed100037c146 R09: ffffed100037c146
[ 291.481307] R10: 000000001106b143 R11: ffffed100037c145 R12: 1ffff11003bd0f9c
[ 291.481311] R13: ffffed100037c145 R14: fffffbfff7a38dee R15: dffffc0000000000
[ 291.481315] FS: 00007fac4f306740(0000) GS:ffff88801de80000(0000) knlGS:0000000000000000
[ 291.481318] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 291.481321] CR2: 00007fac4f4bb650 CR3: 00000000046b6000 CR4: 00000000000006e0
[ 291.481323] Call Trace:
[ 291.481324]  <IRQ>
[ 291.481326]  ? osq_unlock+0x2a0/0x2a0
[ 291.481329]  ? check_preemption_disabled+0x4c/0x290
[ 291.481331]  ? rcu_accelerate_cbs+0x33/0xed0
[ 291.481333]  _raw_spin_lock_irqsave+0x83/0xa0
[ 291.481336]  sched_auto_affi_period_timer+0x251/0x820
[ 291.481338]  ? __remove_hrtimer+0x151/0x200
[ 291.481340]  __hrtimer_run_queues+0x39d/0xa50
[ 291.481343]  ? tg_update_affinity_domain_down+0x460/0x460
[ 291.481345]  ? enqueue_hrtimer+0x2e0/0x2e0
[ 291.481348]  ? ktime_get_update_offsets_now+0x1d7/0x2c0
[ 291.481350]  hrtimer_run_queues+0x243/0x470
[ 291.481352]  run_local_timers+0x5e/0x150
[ 291.481354]  update_process_times+0x36/0xb0
[ 291.481357]  tick_sched_handle.isra.4+0x7c/0x180
[ 291.481359]  tick_nohz_handler+0xd1/0x1d0
[ 291.481365]  smp_apic_timer_interrupt+0x12c/0x4e0
[ 291.481368]  apic_timer_interrupt+0xf/0x20
[ 291.481370]  </IRQ>
[ 291.481372]  ? smp_call_function_many+0x68c/0x840
[ 291.481375]  ? smp_call_function_many+0x6ab/0x840
[ 291.481377]  ? arch_unregister_cpu+0x60/0x60
[ 291.481379]  ? native_set_fixmap+0x100/0x180
[ 291.481381]  ? arch_unregister_cpu+0x60/0x60
[ 291.481384]  ? set_task_select_cpus+0x116/0x940
[ 291.481386]  ? smp_call_function+0x53/0xc0
[ 291.481388]  ? arch_unregister_cpu+0x60/0x60
[ 291.481390]  ? on_each_cpu+0x49/0xf0
[ 291.481393]  ? set_task_select_cpus+0x115/0x940
[ 291.481395]  ? text_poke_bp+0xff/0x180
[ 291.481397]  ? poke_int3_handler+0xc0/0xc0
[ 291.481400]  ? __set_prefer_cpus_ptr.constprop.4+0x1cd/0x900
[ 291.481402]  ? hrtick+0x1b0/0x1b0
[ 291.481404]  ? set_task_select_cpus+0x115/0x940
[ 291.481407]  ? __jump_label_transform.isra.0+0x3a1/0x470
[ 291.481409]  ? kernel_init+0x280/0x280
[ 291.481411]  ? kasan_check_read+0x1d/0x30
[ 291.481413]  ? mutex_lock+0x96/0x100
[ 291.481415]  ? __mutex_lock_slowpath+0x30/0x30
[ 291.481418]  ? arch_jump_label_transform+0x52/0x80
[ 291.481420]  ? set_task_select_cpus+0x115/0x940
[ 291.481422]  ? __jump_label_update+0x1a1/0x1e0
[ 291.481424]  ? jump_label_update+0x2ee/0x3b0
[ 291.481427]  ? static_key_slow_inc_cpuslocked+0x1c8/0x2d0
[ 291.481430]  ? start_auto_affinity+0x190/0x200
[ 291.481432]  ? tg_set_dynamic_affinity_mode+0xad/0xf0
[ 291.481435]  ? cpu_affinity_mode_write_u64+0x22/0x30
[ 291.481437]  ? cgroup_file_write+0x46f/0x660
[ 291.481439]  ? cgroup_init_cftypes+0x300/0x300
[ 291.481441]  ? __mutex_lock_slowpath+0x30/0x30
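Here the write path, still holding auto_affi->lock with interrupts disabled,
reaches the static-key update (start_auto_affinity() ->
static_key_slow_inc_cpuslocked() -> jump_label_update() ->
text_poke_bp()/smp_call_function()), while the period timer handler on CPU 1
is already spinning on the same lock, as shown above.

Fix both cases by no longer taking the lock in tg_set_dynamic_affinity_mode():
start_auto_affinity()/stop_auto_affinity() take it themselves only around the
mode and timer updates, the static-key change runs after the lock is dropped,
and the per-task css walk under the lock is removed entirely (prefer_cpus is
now resolved per task at selection time via task_prefer_cpus() and
sched_qos_affinity_set()). A condensed view of the resulting start path,
simplified from the diff below (not verbatim):

  void start_auto_affinity(struct auto_affinity *auto_affi)
  {
          ktime_t delay_ms;

          raw_spin_lock_irq(&auto_affi->lock);
          if (auto_affi->period_active == 1) {
                  raw_spin_unlock_irq(&auto_affi->lock);
                  return;
          }

          auto_affi->period_active = 1;
          auto_affi->mode = 1;
          delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms);
          hrtimer_forward_now(&auto_affi->period_timer, delay_ms);
          hrtimer_start_expires(&auto_affi->period_timer,
                          HRTIMER_MODE_ABS_PINNED);
          raw_spin_unlock_irq(&auto_affi->lock);

          /* static-key flip (may IPI other CPUs) runs without the lock */
          smart_grid_usage_inc();
  }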
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 include/linux/sched/grid_qos.h |  12 ++++
 kernel/sched/core.c            |   9 +--
 kernel/sched/fair.c            | 107 ++++++++++++++++++++-------------
 kernel/sched/grid/qos.c        |  14 +++--
 4 files changed, 88 insertions(+), 54 deletions(-)
diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h
index cea2bf651880..23d08dbb6ae6 100644
--- a/include/linux/sched/grid_qos.h
+++ b/include/linux/sched/grid_qos.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_SCHED_GRID_QOS_H
 #define _LINUX_SCHED_GRID_QOS_H
 #include <linux/nodemask.h>
+#include <linux/sched.h>
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 enum sched_grid_qos_class {
@@ -61,6 +62,7 @@ struct sched_grid_qos_power {
 
 struct sched_grid_qos_affinity {
 	nodemask_t mem_preferred_node_mask;
+	const struct cpumask *prefer_cpus;
 };
 
 struct task_struct;
@@ -72,6 +74,11 @@ struct sched_grid_qos {
 	int (*affinity_set)(struct task_struct *p);
 };
 
+static inline int sched_qos_affinity_set(struct task_struct *p)
+{
+	return p->grid_qos->affinity_set(p);
+}
+
 int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig);
 void sched_grid_qos_free(struct task_struct *p);
 
@@ -88,5 +95,10 @@ sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask)
 {
 	return preferred_nid;
 }
+
+static inline int sched_qos_affinity_set(struct task_struct *p)
+{
+	return 0;
+}
 #endif
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e3ef981a2fa2..2628382a226c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9530,9 +9530,6 @@ static inline s64 cpu_smt_expell_read(struct cgroup_subsys_state *css,
 int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode)
 {
 	struct auto_affinity *auto_affi = tg->auto_affinity;
-	int ret = 0;
-
-	raw_spin_lock_irq(&auto_affi->lock);
 
 	/* auto mode*/
 	if (mode == 1) {
@@ -9540,14 +9537,10 @@ int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode)
 	} else if (mode == 0) {
 		stop_auto_affinity(auto_affi);
 	} else {
-		raw_spin_unlock_irq(&auto_affi->lock);
 		return -EINVAL;
 	}
 
-	auto_affi->mode = mode;
-	raw_spin_unlock_irq(&auto_affi->lock);
-
-	return ret;
+	return 0;
 }
 
 static u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fca264cfcd1a..50534a27f891 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -28,9 +28,7 @@
 #include <linux/delay.h>
 #include <linux/tracehook.h>
 #endif
-#ifdef CONFIG_QOS_SCHED_SMART_GRID
 #include <linux/sched/grid_qos.h>
-#endif
 #include <linux/bpf_sched.h>
 
 /*
@@ -5821,6 +5819,7 @@ static inline unsigned long cpu_util(int cpu);
 static unsigned long capacity_of(int cpu);
 static int sched_idle_cpu(int cpu);
 static unsigned long cpu_runnable(struct rq *rq);
+static inline bool prefer_cpus_valid(struct task_struct *p);
 
 int sysctl_affinity_adjust_delay_ms = 5000;
 
@@ -5836,22 +5835,29 @@ static void smart_grid_usage_dec(void)
 	static_key_slow_dec(&__smart_grid_used);
 }
 
-static void tg_update_task_prefer_cpus(struct task_group *tg)
+static inline struct cpumask *task_prefer_cpus(struct task_struct *p)
 {
-	struct affinity_domain *ad = &tg->auto_affinity->ad;
-	struct task_struct *task;
-	struct css_task_iter it;
+	struct affinity_domain *ad;
 
-	css_task_iter_start(&tg->css, 0, &it);
-	while ((task = css_task_iter_next(&it))) {
-		if (tg == &root_task_group && !task->mm)
-			continue;
+	if (!smart_grid_used())
+		return p->prefer_cpus;
 
-		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
-		/* grid_qos must not be NULL */
-		task->grid_qos->affinity_set(task);
-	}
-	css_task_iter_end(&it);
+	if (task_group(p)->auto_affinity->mode == 0)
+		return (void *)p->cpus_ptr;
+
+	ad = &task_group(p)->auto_affinity->ad;
+	return ad->domains[ad->curr_level];
+}
+
+static inline int dynamic_affinity_mode(struct task_struct *p)
+{
+	if (!prefer_cpus_valid(p))
+		return -1;
+
+	if (smart_grid_used())
+		return task_group(p)->auto_affinity->mode == 0 ? -1 : 1;
+
+	return 0;
 }
 
 static void affinity_domain_up(struct task_group *tg)
@@ -5872,8 +5878,6 @@ static void affinity_domain_up(struct task_group *tg)
 
 	if (level == ad->dcount)
 		return;
-
-	tg_update_task_prefer_cpus(tg);
 }
 
 static void affinity_domain_down(struct task_group *tg)
@@ -5894,8 +5898,6 @@ static void affinity_domain_down(struct task_group *tg)
 
 	if (!level)
 		return;
-
-	tg_update_task_prefer_cpus(tg);
 }
 
 static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer)
@@ -5961,8 +5963,6 @@ static int tg_update_affinity_domain_down(struct task_group *tg, void *data)
 	if (!smart_grid_used())
 		return 0;
 
-	if (auto_affi->mode)
-		tg_update_task_prefer_cpus(tg);
 	return 0;
 }
 
@@ -5980,35 +5980,41 @@ void tg_update_affinity_domains(int cpu, int online)
 
 void start_auto_affinity(struct auto_affinity *auto_affi)
 {
-	struct task_group *tg = auto_affi->tg;
 	ktime_t delay_ms;
 
-	if (auto_affi->period_active == 1)
+	raw_spin_lock_irq(&auto_affi->lock);
+	if (auto_affi->period_active == 1) {
+		raw_spin_unlock_irq(&auto_affi->lock);
 		return;
-
-	tg_update_task_prefer_cpus(tg);
+	}
 
 	auto_affi->period_active = 1;
+	auto_affi->mode = 1;
 	delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms);
 	hrtimer_forward_now(&auto_affi->period_timer, delay_ms);
 	hrtimer_start_expires(&auto_affi->period_timer,
 			HRTIMER_MODE_ABS_PINNED);
+	raw_spin_unlock_irq(&auto_affi->lock);
+
 	smart_grid_usage_inc();
 }
 
 void stop_auto_affinity(struct auto_affinity *auto_affi)
 {
-	struct task_group *tg = auto_affi->tg;
 	struct affinity_domain *ad = &auto_affi->ad;
 
-	if (auto_affi->period_active == 0)
+	raw_spin_lock_irq(&auto_affi->lock);
+	if (auto_affi->period_active == 0) {
+		raw_spin_unlock_irq(&auto_affi->lock);
 		return;
+	}
 
 	hrtimer_cancel(&auto_affi->period_timer);
 	auto_affi->period_active = 0;
+	auto_affi->mode = 0;
 	ad->curr_level = ad->dcount > 0 ? ad->dcount - 1 : 0;
+	raw_spin_unlock_irq(&auto_affi->lock);
 
-	tg_update_task_prefer_cpus(tg);
 	smart_grid_usage_dec();
 }
 
@@ -6226,6 +6232,19 @@ static void destroy_auto_affinity(struct task_group *tg)
 	}
 #else
 static void destroy_auto_affinity(struct task_group *tg) {}
+
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+static inline struct cpumask *task_prefer_cpus(struct task_struct *p)
+{
+	return p->prefer_cpus;
+}
+#endif
+
+static inline int dynamic_affinity_mode(struct task_struct *p)
+{
+	return 0;
+}
+
 #endif
 
 /**************************************************
@@ -7748,10 +7767,11 @@ int sysctl_sched_util_low_pct = 85;
 
 static inline bool prefer_cpus_valid(struct task_struct *p)
 {
-	return p->prefer_cpus &&
-	       !cpumask_empty(p->prefer_cpus) &&
-	       !cpumask_equal(p->prefer_cpus, p->cpus_ptr) &&
-	       cpumask_subset(p->prefer_cpus, p->cpus_ptr);
+	struct cpumask *prefer_cpus = task_prefer_cpus(p);
+
+	return !cpumask_empty(prefer_cpus) &&
+	       !cpumask_equal(prefer_cpus, p->cpus_ptr) &&
+	       cpumask_subset(prefer_cpus, p->cpus_ptr);
 }
 
 static inline unsigned long taskgroup_cpu_util(struct task_group *tg,
@@ -7786,20 +7806,23 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 	long min_util = INT_MIN;
 	struct task_group *tg;
 	long spare;
-	int cpu;
+	int cpu, mode;
 
-	p->select_cpus = p->cpus_ptr;
-	if (!prefer_cpus_valid(p))
+	rcu_read_lock();
+	mode = dynamic_affinity_mode(p);
+	if (mode == -1) {
+		rcu_read_unlock();
 		return;
-
-	if (smart_grid_used()) {
-		p->select_cpus = p->prefer_cpus;
+	} else if (mode == 1) {
+		p->select_cpus = task_prefer_cpus(p);
 		if (idlest_cpu)
 			*idlest_cpu = cpumask_first(p->select_cpus);
+		sched_qos_affinity_set(p);
+		rcu_read_unlock();
 		return;
 	}
 
-	rcu_read_lock();
+	/* manual mode */
 	tg = task_group(p);
 	for_each_cpu(cpu, p->prefer_cpus) {
 		if (idlest_cpu && (available_idle_cpu(cpu) || sched_idle_cpu(cpu))) {
@@ -7867,13 +7890,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	time = schedstat_start_time();
 
 	/*
-	 * required for stable ->cpus_allowed
+	 * required for stable ->cpus_ptr
 	 */
 	lockdep_assert_held(&p->pi_lock);
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	p->select_cpus = p->cpus_ptr;
-	if (dynamic_affinity_used())
+	if (dynamic_affinity_used() || smart_grid_used())
 		set_task_select_cpus(p, &idlest_cpu, sd_flag);
 #endif
 
@@ -9464,7 +9487,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	p->select_cpus = p->cpus_ptr;
-	if (dynamic_affinity_used())
+	if (dynamic_affinity_used() || smart_grid_used())
 		set_task_select_cpus(p, NULL, 0);
 	if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) {
 #else
diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c
index 3fb433d213fd..7081bee588ee 100644
--- a/kernel/sched/grid/qos.c
+++ b/kernel/sched/grid/qos.c
@@ -24,20 +24,26 @@
 #include <linux/sched/grid_qos.h>
 #include "internal.h"
 
-static int qos_affinity_set(struct task_struct *p)
+static inline int qos_affinity_set(struct task_struct *p)
 {
 	int n;
 	struct sched_grid_qos_affinity *affinity = &p->grid_qos->affinity;
 
-	nodes_clear(affinity->mem_preferred_node_mask);
+	if (likely(affinity->prefer_cpus == p->select_cpus))
+		return 0;
+
 	/*
 	 * We want the memory allocation to be as close to the CPU
 	 * as possible, and adjust after getting memory bandwidth usage.
 	 */
-	for (n = 0; n < nr_node_ids; n++)
-		if (cpumask_intersects(cpumask_of_node(n), p->prefer_cpus))
+	for (n = 0; n < nr_node_ids; n++) {
+		if (cpumask_intersects(cpumask_of_node(n), p->select_cpus))
 			node_set(n, affinity->mem_preferred_node_mask);
+		else
+			node_clear(n, affinity->mem_preferred_node_mask);
+	}
 
+	affinity->prefer_cpus = p->select_cpus;
 	return 0;
 }