From: Zhang Qiao <zhangqiao22@huawei.com>
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I64OUS
CVE: NA
-------------------------------
When a cfs_rq is throttled by qos, cfs_rq->throttled is set to 1, so cfs bandwidth may unthrottle this cfs_rq by mistake, which causes a list_del_valid warning. Add the macro QOS_THROTTLED (= 2): when a cfs_rq is throttled by qos, mark cfs_rq->throttled as QOS_THROTTLED, and check the value of cfs_rq->throttled before unthrottling a cfs_rq.
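For reference only (not part of the change itself), a minimal user-space model of the resulting behaviour. The value 1 used by cfs bandwidth and QOS_THROTTLED (= 2) come from this patch; the names CFS_BW_THROTTLED, cfs_rq_model, bw_unthrottle and qos_unthrottle are invented for this sketch:

  #include <stdio.h>

  #define CFS_BW_THROTTLED 1   /* value used by cfs bandwidth (per the patch) */
  #define QOS_THROTTLED    2   /* value introduced by this patch for qos */

  struct cfs_rq_model {
          int throttled;
  };

  /* cfs bandwidth path: must not unthrottle a runqueue that qos throttled */
  static void bw_unthrottle(struct cfs_rq_model *cfs_rq)
  {
          if (cfs_rq->throttled == QOS_THROTTLED)
                  return;
          cfs_rq->throttled = 0;
  }

  /* qos path: only undo a throttle that qos itself applied */
  static void qos_unthrottle(struct cfs_rq_model *cfs_rq)
  {
          if (cfs_rq->throttled != QOS_THROTTLED)
                  return;
          cfs_rq->throttled = 0;
  }

  int main(void)
  {
          struct cfs_rq_model rq = { .throttled = QOS_THROTTLED };

          bw_unthrottle(&rq);   /* ignored: 2 != 1, cfs bandwidth leaves it alone */
          printf("after bw unthrottle:  %d\n", rq.throttled);   /* prints 2 */
          qos_unthrottle(&rq);  /* qos clears its own throttle */
          printf("after qos unthrottle: %d\n", rq.throttled);   /* prints 0 */
          return 0;
  }

Before this change both throttle paths used the single value 1, so a cfs bandwidth unthrottle could operate on a qos-throttled cfs_rq, which is what produced the list_del_valid warning.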
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: zheng zucheng <zhengzucheng@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 kernel/sched/fair.c | 59 ++++++++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 25 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a34ca843bf0a..0dba06ce0677 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -103,6 +103,13 @@ int __weak arch_asym_cpu_priority(int cpu)
 #endif
 #ifdef CONFIG_QOS_SCHED
+
+/*
+ * When qos throttles a cfs_rq, mark cfs_rq->throttled as QOS_THROTTLED to
+ * distinguish it from cfs bandwidth, which marks cfs_rq->throttled as 1.
+ */
+#define QOS_THROTTLED 2
+
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer);
 static DEFINE_PER_CPU(int, qos_cpu_overload);
@@ -4649,6 +4656,14 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq)];
+#ifdef CONFIG_QOS_SCHED
+       /*
+        * If this cfs_rq is throttled by qos, there is no need to unthrottle it.
+        */
+       if (cfs_rq->throttled == QOS_THROTTLED)
+               return;
+#endif
+
        cfs_rq->throttled = 0;
        update_rq_clock(rq);
@@ -7066,37 +7081,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 #ifdef CONFIG_QOS_SCHED
 static void start_qos_hrtimer(int cpu);
-static int qos_tg_unthrottle_up(struct task_group *tg, void *data)
-{
-       struct rq *rq = data;
-       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
-
-       cfs_rq->throttle_count--;
-
-       return 0;
-}
-
-static int qos_tg_throttle_down(struct task_group *tg, void *data)
-{
-       struct rq *rq = data;
-       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
-
-       cfs_rq->throttle_count++;
-
-       return 0;
-}
-
 static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
        struct rq *rq = rq_of(cfs_rq);
        struct sched_entity *se;
        long task_delta, idle_task_delta, dequeue = 1;
+       unsigned int prev_nr = rq->cfs.h_nr_running;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
        /* freeze hierarchy runnable averages while throttled */
        rcu_read_lock();
-       walk_tg_tree_from(cfs_rq->tg, qos_tg_throttle_down, tg_nop, (void *)rq);
+       walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
        rcu_read_unlock();
        task_delta = cfs_rq->h_nr_running;
@@ -7118,12 +7114,14 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
        if (!se) {
                sub_nr_running(rq, task_delta);
+               if (prev_nr >= 2 && prev_nr - task_delta < 2)
+                       overload_clear(rq);
        }
        if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq))))
                start_qos_hrtimer(cpu_of(rq));
-       cfs_rq->throttled = 1;
+       cfs_rq->throttled = QOS_THROTTLED;
        list_add(&cfs_rq->qos_throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq)));
@@ -7135,8 +7133,11 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
        struct sched_entity *se;
        int enqueue = 1;
        long task_delta, idle_task_delta;
+       unsigned int prev_nr = rq->cfs.h_nr_running;
        se = cfs_rq->tg->se[cpu_of(rq)];
+       if (cfs_rq->throttled != QOS_THROTTLED)
+               return;
cfs_rq->throttled = 0;
@@ -7145,7 +7146,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
        /* update hierarchical throttle state */
        rcu_read_lock();
-       walk_tg_tree_from(cfs_rq->tg, tg_nop, qos_tg_unthrottle_up, (void *)rq);
+       walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
        rcu_read_unlock();
        if (!cfs_rq->load.weight)
@@ -7167,12 +7168,20 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
                        break;
        }
-       assert_list_leaf_cfs_rq(rq);
-
        if (!se) {
                add_nr_running(rq, task_delta);
+               if (prev_nr < 2 && prev_nr + task_delta >= 2)
+                       overload_set(rq);
+       }
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+
+               list_add_leaf_cfs_rq(cfs_rq);
        }
+       assert_list_leaf_cfs_rq(rq);
+
        /* Determine whether we need to wake up potentially idle CPU: */
        if (rq->curr == rq->idle && rq->cfs.nr_running)
                resched_curr(rq);
From: Zhihao Cheng <chengzhihao1@huawei.com>
mainline inclusion
from mainline-v6.1
commit 8111964f1b8524c4bb56b02cd9c7a37725ea21fd
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I65I9A
CVE: NA
-------------------------------
The following processes run concurrently:
P1(drop cache)                          P2(kworker)
drop_caches_sysctl_handler
 drop_slab
  shrink_slab
   down_read(&shrinker_rwsem)  - LOCK A
    do_shrink_slab
     super_cache_scan
      prune_icache_sb
       dispose_list
        evict
         ext4_evict_inode
          ext4_clear_inode
           ext4_discard_preallocations
            ext4_mb_load_buddy_gfp
             ext4_mb_init_cache
              ext4_read_block_bitmap_nowait
               ext4_read_bh_nowait
                submit_bh
                 dm_submit_bio
                                        do_worker
                                         process_deferred_bios
                                          commit
                                           metadata_operation_failed
                                            dm_pool_abort_metadata
                                             down_write(&pmd->root_lock)  - LOCK B
                                              __destroy_persistent_data_objects
                                               dm_block_manager_destroy
                                                dm_bufio_client_destroy
                                                 unregister_shrinker
                                                  down_write(&shrinker_rwsem)  - LOCK A
                 thin_map            |
                  dm_thin_find_block ↓
                   down_read(&pmd->root_lock)  - LOCK B   --> ABBA deadlock
which triggers a hung task:
[ 76.974820] INFO: task kworker/u4:3:63 blocked for more than 15 seconds.
[ 76.976019] Not tainted 6.1.0-rc4-00011-g8f17dd350364-dirty #910
[ 76.978521] task:kworker/u4:3 state:D stack:0 pid:63 ppid:2
[ 76.978534] Workqueue: dm-thin do_worker
[ 76.978552] Call Trace:
[ 76.978564]  __schedule+0x6ba/0x10f0
[ 76.978582]  schedule+0x9d/0x1e0
[ 76.978588]  rwsem_down_write_slowpath+0x587/0xdf0
[ 76.978600]  down_write+0xec/0x110
[ 76.978607]  unregister_shrinker+0x2c/0xf0
[ 76.978616]  dm_bufio_client_destroy+0x116/0x3d0
[ 76.978625]  dm_block_manager_destroy+0x19/0x40
[ 76.978629]  __destroy_persistent_data_objects+0x5e/0x70
[ 76.978636]  dm_pool_abort_metadata+0x8e/0x100
[ 76.978643]  metadata_operation_failed+0x86/0x110
[ 76.978649]  commit+0x6a/0x230
[ 76.978655]  do_worker+0xc6e/0xd90
[ 76.978702]  process_one_work+0x269/0x630
[ 76.978714]  worker_thread+0x266/0x630
[ 76.978730]  kthread+0x151/0x1b0
[ 76.978772] INFO: task test.sh:2646 blocked for more than 15 seconds.
[ 76.979756] Not tainted 6.1.0-rc4-00011-g8f17dd350364-dirty #910
[ 76.982111] task:test.sh state:D stack:0 pid:2646 ppid:2459
[ 76.982128] Call Trace:
[ 76.982139]  __schedule+0x6ba/0x10f0
[ 76.982155]  schedule+0x9d/0x1e0
[ 76.982159]  rwsem_down_read_slowpath+0x4f4/0x910
[ 76.982173]  down_read+0x84/0x170
[ 76.982177]  dm_thin_find_block+0x4c/0xd0
[ 76.982183]  thin_map+0x201/0x3d0
[ 76.982188]  __map_bio+0x5b/0x350
[ 76.982195]  dm_submit_bio+0x2b6/0x930
[ 76.982202]  __submit_bio+0x123/0x2d0
[ 76.982209]  submit_bio_noacct_nocheck+0x101/0x3e0
[ 76.982222]  submit_bio_noacct+0x389/0x770
[ 76.982227]  submit_bio+0x50/0xc0
[ 76.982232]  submit_bh_wbc+0x15e/0x230
[ 76.982238]  submit_bh+0x14/0x20
[ 76.982241]  ext4_read_bh_nowait+0xc5/0x130
[ 76.982247]  ext4_read_block_bitmap_nowait+0x340/0xc60
[ 76.982254]  ext4_mb_init_cache+0x1ce/0xdc0
[ 76.982259]  ext4_mb_load_buddy_gfp+0x987/0xfa0
[ 76.982263]  ext4_discard_preallocations+0x45d/0x830
[ 76.982274]  ext4_clear_inode+0x48/0xf0
[ 76.982280]  ext4_evict_inode+0xcf/0xc70
[ 76.982285]  evict+0x119/0x2b0
[ 76.982290]  dispose_list+0x43/0xa0
[ 76.982294]  prune_icache_sb+0x64/0x90
[ 76.982298]  super_cache_scan+0x155/0x210
[ 76.982303]  do_shrink_slab+0x19e/0x4e0
[ 76.982310]  shrink_slab+0x2bd/0x450
[ 76.982317]  drop_slab+0xcc/0x1a0
[ 76.982323]  drop_caches_sysctl_handler+0xb7/0xe0
[ 76.982327]  proc_sys_call_handler+0x1bc/0x300
[ 76.982331]  proc_sys_write+0x17/0x20
[ 76.982334]  vfs_write+0x3d3/0x570
[ 76.982342]  ksys_write+0x73/0x160
[ 76.982347]  __x64_sys_write+0x1e/0x30
[ 76.982352]  do_syscall_64+0x35/0x80
[ 76.982357]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
Function metadata_operation_failed() is called when operations fail on dm pool metadata; the dm pool then destroys and recreates the metadata, so the shrinker is unregistered and registered again, which can acquire shrinker_rwsem for write under pmd_write_lock.
Fix it by allocating the new dm_block_manager before taking pmd->root_lock and destroying the old dm_block_manager after releasing pmd->root_lock; the old dm_block_manager is swapped for the new one while pmd->root_lock is held. This way, shrinker registration and unregistration are done without holding pmd->root_lock.
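For illustration only, a condensed user-space model of the new ordering in dm_pool_abort_metadata(). The locking order mirrors the patch below, but struct bm, pmd_bm, block_manager_create/destroy, pool_abort_metadata and the pthread rwlock are stand-ins for the real dm-thin code, and error handling is omitted:

  #include <pthread.h>
  #include <stdio.h>

  /* stand-in for struct dm_block_manager; creating/destroying the real one
   * registers/unregisters a bufio shrinker (takes shrinker_rwsem), so it must
   * never happen while root_lock is held */
  struct bm { int id; };

  static pthread_rwlock_t root_lock = PTHREAD_RWLOCK_INITIALIZER;
  static struct bm *pmd_bm;   /* stand-in for pmd->bm */

  static struct bm *block_manager_create(int id)
  {
          static struct bm slots[2];

          slots[id].id = id;
          printf("create bm %d (root_lock not held)\n", id);
          return &slots[id];
  }

  static void block_manager_destroy(struct bm *b)
  {
          printf("destroy bm %d (root_lock not held)\n", b->id);
  }

  static int pool_abort_metadata(void)
  {
          struct bm *old_bm, *new_bm;

          new_bm = block_manager_create(1);       /* 1. allocate outside root_lock */

          pthread_rwlock_wrlock(&root_lock);      /* 2. swap under root_lock */
          old_bm = pmd_bm;
          pmd_bm = new_bm;
          pthread_rwlock_unlock(&root_lock);

          block_manager_destroy(old_bm);          /* 3. destroy outside root_lock */
          return 0;
  }

  int main(void)
  {
          pmd_bm = block_manager_create(0);
          return pool_abort_metadata();
  }

With the old code, both the destroy and the recreate of the block manager ran between down_write()/up_write() of pmd->root_lock, which is exactly the window where unregister_shrinker()/register_shrinker() need shrinker_rwsem.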
Fetch a reproducer in [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216676
Fixes: e49e582965b3 ("dm thin: add read only and fail io modes")
Conflicts:
	drivers/md/dm-thin-metadata.c
[ 873f258becc("dm thin metadata: do not write metadata if no changes occurred") is not applied.
  6a1b1ddc6a2cf("dm thin metadata: add wrappers for managing write locking of metadata") is not applied. ]
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 drivers/md/dm-thin-metadata.c | 51 +++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 8 deletions(-)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index c3dc171114dc..80c4f27e99f9 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -712,13 +712,15 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
        return r;
 }
-static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
+static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd,
+                                              bool destroy_bm)
 {
        dm_sm_destroy(pmd->data_sm);
        dm_sm_destroy(pmd->metadata_sm);
        dm_tm_destroy(pmd->nb_tm);
        dm_tm_destroy(pmd->tm);
-       dm_block_manager_destroy(pmd->bm);
+       if (destroy_bm)
+               dm_block_manager_destroy(pmd->bm);
 }
 static int __begin_transaction(struct dm_pool_metadata *pmd)
@@ -909,7 +911,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
        }
        if (!pmd->fail_io)
-               __destroy_persistent_data_objects(pmd);
+               __destroy_persistent_data_objects(pmd, true);
        kfree(pmd);
        return 0;
@@ -1809,19 +1811,52 @@ static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
 int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
 {
        int r = -EINVAL;
+       struct dm_block_manager *old_bm = NULL, *new_bm = NULL;
+
+       /* fail_io is double-checked with pmd->root_lock held below */
+       if (unlikely(pmd->fail_io))
+               return r;
+
+       /*
+        * Replacement block manager (new_bm) is created and old_bm destroyed outside of
+        * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
+        * shrinker associated with the block manager's bufio client vs pmd root_lock).
+        * - must take shrinker_rwsem without holding pmd->root_lock
+        */
+       new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
+                                        THIN_MAX_CONCURRENT_LOCKS);
        down_write(&pmd->root_lock);
-       if (pmd->fail_io)
+       if (pmd->fail_io) {
+               up_write(&pmd->root_lock);
                goto out;
+       }
        __set_abort_with_changes_flags(pmd);
-       __destroy_persistent_data_objects(pmd);
-       r = __create_persistent_data_objects(pmd, false);
+       __destroy_persistent_data_objects(pmd, false);
+       old_bm = pmd->bm;
+       if (IS_ERR(new_bm)) {
+               DMERR("could not create block manager during abort");
+               pmd->bm = NULL;
+               r = PTR_ERR(new_bm);
+               goto out_unlock;
+       }
+
+       pmd->bm = new_bm;
+       r = __open_or_format_metadata(pmd, false);
+       if (r) {
+               pmd->bm = NULL;
+               goto out_unlock;
+       }
+       new_bm = NULL;
+out_unlock:
        if (r)
                pmd->fail_io = true;
-
-out:
        up_write(&pmd->root_lock);
+       dm_block_manager_destroy(old_bm);
+out:
+       if (new_bm && !IS_ERR(new_bm))
+               dm_block_manager_destroy(new_bm);
        return r;
 }