From: Zhang Wensheng zhangwensheng5@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5N162
CVE: NA
--------------------------------
In the use of q_usage_counter of request_queue, blk_cleanup_queue() uses "wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter))" to wait for q_usage_counter to drop to zero. However, if q_usage_counter reaches zero quickly, percpu_ref_exit() can run and free ref->data before the process that dropped the last reference has called ref->data->release(); that process then triggers a null-deref as shown below:
CPU0                                  CPU1
blk_cleanup_queue
  blk_freeze_queue
    blk_mq_freeze_queue_wait
                                      scsi_end_request
                                        percpu_ref_get
                                        ...
                                        percpu_ref_put
                                          atomic_long_sub_and_test
  percpu_ref_exit
    ref->data -> NULL
                                          ref->data->release(ref) -> null-deref
Fix it by adding a synchronization flag (QUEUE_FLAG_USAGE_COUNT_SYNC): the flag is set when ref->data->release() is called, and the wait_event() in the new blk_mq_freeze_queue_wait_sync() must also wait for the flag to be set, which prevents percpu_ref_exit() from running ahead of time.
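For readability, here are the two sides of the fix from the hunks below assembled as plain C (condensed from the diff, not an additional change): the release callback sets the flag before waking the waiter, and the new sync wait only returns once both the refcount is zero and the flag is set, so blk_cleanup_queue() cannot reach percpu_ref_exit() while release() is still running.

static void blk_queue_usage_counter_release(struct percpu_ref *ref)
{
	struct request_queue *q =
		container_of(ref, struct request_queue, q_usage_counter);

	/* publish "release has run" before the wakeup */
	blk_queue_flag_set(QUEUE_FLAG_USAGE_COUNT_SYNC, q);
	wake_up_all(&q->mq_freeze_wq);
}

void blk_mq_freeze_queue_wait_sync(struct request_queue *q)
{
	/* wait for the counter to hit zero AND for release() to have run */
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter) &&
		   test_bit(QUEUE_FLAG_USAGE_COUNT_SYNC, &q->queue_flags));
}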
Signed-off-by: Zhang Wensheng zhangwensheng5@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-core.c | 4 +++- block/blk-mq.c | 7 +++++++ include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 2 ++ 4 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 0b496dabc5ac..448e4d70af7f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -385,7 +385,8 @@ void blk_cleanup_queue(struct request_queue *q) * prevent that blk_mq_run_hw_queues() accesses the hardware queues * after draining finished. */ - blk_freeze_queue(q); + blk_freeze_queue_start(q); + blk_mq_freeze_queue_wait_sync(q);
rq_qos_exit(q);
@@ -502,6 +503,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter);
+ blk_queue_flag_set(QUEUE_FLAG_USAGE_COUNT_SYNC, q); wake_up_all(&q->mq_freeze_wq); }
diff --git a/block/blk-mq.c b/block/blk-mq.c index e1fcdbefcac0..ab1b0bfc64f9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -193,6 +193,7 @@ void blk_freeze_queue_start(struct request_queue *q) { mutex_lock(&q->mq_freeze_lock); if (++q->mq_freeze_depth == 1) { + blk_queue_flag_clear(QUEUE_FLAG_USAGE_COUNT_SYNC, q); percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) @@ -203,6 +204,12 @@ void blk_freeze_queue_start(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
+void blk_mq_freeze_queue_wait_sync(struct request_queue *q) +{ + wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter) && + test_bit(QUEUE_FLAG_USAGE_COUNT_SYNC, &q->queue_flags)); +} + void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ac83257972a0..e4e46229d0eb 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -574,6 +574,7 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); +void blk_mq_freeze_queue_wait_sync(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49540ce9e325..4c046530edb9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -643,6 +643,8 @@ struct request_queue { #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ /*at least one blk-mq hctx can't get driver tag */ #define QUEUE_FLAG_HCTX_WAIT 30 +/* sync for q_usage_counter */ +#define QUEUE_FLAG_USAGE_COUNT_SYNC 31
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \
From: Zhang Wensheng zhangwensheng5@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
--------------------------------
This reverts commit 26eda960e3d769b96abfe81eb176319c7b99f7fc.
The related fields will be modified in later patches, which are backported from mainline.
Signed-off-by: Zhang Wensheng zhangwensheng5@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-core.c | 10 ++++------ block/blk-mq-sched.c | 17 +++++++---------- block/blk-mq.c | 7 +++---- block/blk-sysfs.c | 2 +- block/blk.h | 13 ------------- include/linux/blkdev.h | 3 +++ 6 files changed, 18 insertions(+), 34 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 448e4d70af7f..662d9c7b3e79 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -521,15 +521,13 @@ static void blk_timeout_work(struct work_struct *work) struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; - struct request_queue_wrapper *q_wrapper; int ret;
- q_wrapper = kmem_cache_alloc_node(blk_requestq_cachep, + q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); - if (!q_wrapper) + if (!q) return NULL;
- q = &q_wrapper->q; q->last_merge = NULL;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); @@ -600,7 +598,7 @@ struct request_queue *blk_alloc_queue(int node_id) fail_id: ida_simple_remove(&blk_queue_ida, q->id); fail_q: - kmem_cache_free(blk_requestq_cachep, q_wrapper); + kmem_cache_free(blk_requestq_cachep, q); return NULL; } EXPORT_SYMBOL(blk_alloc_queue); @@ -1821,7 +1819,7 @@ int __init blk_dev_init(void) panic("Failed to create kblockd\n");
blk_requestq_cachep = kmem_cache_create("request_queue", - sizeof(struct request_queue_wrapper), 0, SLAB_PANIC, NULL); + sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
blk_debugfs_root = debugfs_create_dir("block", NULL);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0aa2069d95d5..721a4f9a014c 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -555,14 +555,13 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); struct blk_mq_hw_ctx *hctx; int ret, i; - struct request_queue_wrapper *q_wrapper = queue_to_wrapper(queue);
/* * Set initial depth at max so that we don't need to reallocate for * updating nr_requests. */ - ret = blk_mq_init_bitmaps(&q_wrapper->sched_bitmap_tags, - &q_wrapper->sched_breserved_tags, + ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, + &queue->sched_breserved_tags, MAX_SCHED_RQ, set->reserved_tags, set->numa_node, alloc_policy); if (ret) @@ -570,12 +569,12 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
queue_for_each_hw_ctx(queue, hctx, i) { hctx->sched_tags->bitmap_tags = - &q_wrapper->sched_bitmap_tags; + &queue->sched_bitmap_tags; hctx->sched_tags->breserved_tags = - &q_wrapper->sched_breserved_tags; + &queue->sched_breserved_tags; }
- sbitmap_queue_resize(&q_wrapper->sched_bitmap_tags, + sbitmap_queue_resize(&queue->sched_bitmap_tags, queue->nr_requests - set->reserved_tags);
return 0; @@ -583,10 +582,8 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) { - struct request_queue_wrapper *q_wrapper = queue_to_wrapper(queue); - - sbitmap_queue_free(&q_wrapper->sched_bitmap_tags); - sbitmap_queue_free(&q_wrapper->sched_breserved_tags); + sbitmap_queue_free(&queue->sched_bitmap_tags); + sbitmap_queue_free(&queue->sched_breserved_tags); }
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) diff --git a/block/blk-mq.c b/block/blk-mq.c index ab1b0bfc64f9..61477c824823 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3746,7 +3746,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; int i, ret; - struct request_queue_wrapper *q_wrapper = queue_to_wrapper(q);
if (!set) return -EINVAL; @@ -3775,9 +3774,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) nr, true); if (blk_mq_is_sbitmap_shared(set->flags)) { hctx->sched_tags->bitmap_tags = - &q_wrapper->sched_bitmap_tags; + &q->sched_bitmap_tags; hctx->sched_tags->breserved_tags = - &q_wrapper->sched_breserved_tags; + &q->sched_breserved_tags; } } if (ret) @@ -3788,7 +3787,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) { q->nr_requests = nr; if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) - sbitmap_queue_resize(&q_wrapper->sched_bitmap_tags, + sbitmap_queue_resize(&q->sched_bitmap_tags, nr - set->reserved_tags); }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b809c0bf7686..780f02cbda84 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -726,7 +726,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); - kmem_cache_free(blk_requestq_cachep, queue_to_wrapper(q)); + kmem_cache_free(blk_requestq_cachep, q); }
/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ diff --git a/block/blk.h b/block/blk.h index b8948fda06e1..3165c16725d5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -28,19 +28,6 @@ struct blk_flush_queue { spinlock_t mq_flush_lock; };
-/* - * The wrapper of request_queue to fix kabi while adding members. - */ -struct request_queue_wrapper { - struct request_queue q; - - struct sbitmap_queue sched_bitmap_tags; - struct sbitmap_queue sched_breserved_tags; -}; - -#define queue_to_wrapper(queue) \ - container_of(queue, struct request_queue_wrapper, q) - extern struct kmem_cache *blk_requestq_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4c046530edb9..87116833d7ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -499,6 +499,9 @@ struct request_queue {
atomic_t nr_active_requests_shared_sbitmap;
+ struct sbitmap_queue sched_bitmap_tags; + struct sbitmap_queue sched_breserved_tags; + struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.16-rc1
commit 65de57bb2e66f1fbede166c1307570ebd09eae83
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The original code in commit 24d2f90309b23 ("blk-mq: split out tag initialization, support shared tags") would check tags->rqs is non-NULL and then dereference tags->rqs[].
Then in commit 2af8cbe30531 ("blk-mq: split tag ->rqs[] into two"), we started to dereference tags->static_rqs[], but continued to check non-NULL tags->rqs.
Check tags->static_rqs as non-NULL instead, which is more logical.
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Ming Lei ming.lei@redhat.com Reviewed-by: Hannes Reinecke hare@suse.de Link: https://lore.kernel.org/r/1633429419-228500-2-git-send-email-john.garry@huaw... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c index 61477c824823..2c2e82163d3b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2470,7 +2470,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, { struct page *page;
- if (tags->rqs && set->ops->exit_request) { + if (tags->static_rqs && set->ops->exit_request) { int i;
for (i = 0; i < tags->nr_tags; i++) {
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.16-rc1
commit 8fa044640f128c0cd015f72cda42989a664889fc
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
For shared sbitmap, if the call to blk_mq_tag_update_depth() was successful for any hctx when hctx->sched_tags is not set, then it would be successful for all (due to nature in which blk_mq_tag_update_depth() fails).
As such, there is no need to call blk_mq_tag_resize_shared_sbitmap() for each hctx. So relocate the call until after the hctx iteration under the !q->elevator check, which is equivalent (to !hctx->sched_tags).
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Ming Lei ming.lei@redhat.com Reviewed-by: Hannes Reinecke hare@suse.de Link: https://lore.kernel.org/r/1633429419-228500-4-git-send-email-john.garry@huaw... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c2e82163d3b..0f75b7baa026 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3767,8 +3767,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); - if (!ret && blk_mq_is_sbitmap_shared(set->flags)) - blk_mq_tag_resize_shared_sbitmap(set, nr); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); @@ -3786,9 +3784,13 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } if (!ret) { q->nr_requests = nr; - if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) - sbitmap_queue_resize(&q->sched_bitmap_tags, - nr - set->reserved_tags); + if (blk_mq_is_sbitmap_shared(set->flags)) { + if (q->elevator) + sbitmap_queue_resize(&q->sched_bitmap_tags, + nr - set->reserved_tags); + else + blk_mq_tag_resize_shared_sbitmap(set, nr); + } }
blk_mq_unquiesce_queue(q);
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.19-rc1
commit f6adcef5f317c78a899ca9766e90e47026834158
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
It's easier to read:

if (x)
	X;
else
	Y;

over:

if (!x)
	Y;
else
	X;
No functional change intended.
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Ming Lei ming.lei@redhat.com Reviewed-by: Hannes Reinecke hare@suse.de Link: https://lore.kernel.org/r/1633429419-228500-5-git-send-email-john.garry@huaw... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c index 0f75b7baa026..4088732cdbf8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3764,18 +3764,18 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * If we're using an MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. */ - if (!hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, - false); - } else { + if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr, true); + nr, true); if (blk_mq_is_sbitmap_shared(set->flags)) { hctx->sched_tags->bitmap_tags = &q->sched_bitmap_tags; hctx->sched_tags->breserved_tags = &q->sched_breserved_tags; } + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, + false); } if (ret) break;
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.16-rc1
commit d99a6bb337677d812d5bef7795c9fcf17f1ccebe
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Function blk_mq_sched_alloc_tags() does the same as __blk_mq_alloc_map_and_request(), so give it a similar name to be consistent.
Similarly rename label err_free_tags -> err_free_map_and_rqs.
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Ming Lei ming.lei@redhat.com Reviewed-by: Hannes Reinecke hare@suse.de Link: https://lore.kernel.org/r/1633429419-228500-6-git-send-email-john.garry@huaw... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-sched.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 721a4f9a014c..943c015b8107 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -514,9 +514,9 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_put(&q->q_usage_counter); }
-static int blk_mq_sched_alloc_tags(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) +static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + unsigned int hctx_idx) { struct blk_mq_tag_set *set = q->tag_set; int ret; @@ -608,15 +608,15 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) BLKDEV_MAX_RQ);
queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_sched_alloc_tags(q, hctx, i); + ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); if (ret) - goto err_free_tags; + goto err_free_map_and_rqs; }
if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { ret = blk_mq_init_sched_shared_sbitmap(q); if (ret) - goto err_free_tags; + goto err_free_map_and_rqs; }
ret = e->ops.init_sched(q, e); @@ -644,7 +644,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) err_free_sbitmap: if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) blk_mq_exit_sched_shared_sbitmap(q); -err_free_tags: +err_free_map_and_rqs: blk_mq_sched_free_requests(q); blk_mq_sched_tags_teardown(q); q->elevator = NULL;
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.16-rc1
commit 1820f4f0a5e75633ff384167027bf45ed1b10234
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
To be more concise and consistent in naming, rename blk_mq_sched_free_requests() -> blk_mq_sched_free_rqs().
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Hannes Reinecke hare@suse.de Reviewed-by: Ming Lei ming.lei@redhat.com Link: https://lore.kernel.org/r/1633429419-228500-7-git-send-email-john.garry@huaw... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-core.c | 2 +- block/blk-mq-sched.c | 6 +++--- block/blk-mq-sched.h | 2 +- block/blk.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 662d9c7b3e79..7b6d5c8c036f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -414,7 +414,7 @@ void blk_cleanup_queue(struct request_queue *q) */ mutex_lock(&q->sysfs_lock); if (q->elevator) - blk_mq_sched_free_requests(q); + blk_mq_sched_free_rqs(q); mutex_unlock(&q->sysfs_lock);
percpu_ref_exit(&q->q_usage_counter); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 943c015b8107..ef81653ff22d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -630,7 +630,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) ret = e->ops.init_hctx(hctx, i); if (ret) { eq = q->elevator; - blk_mq_sched_free_requests(q); + blk_mq_sched_free_rqs(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; @@ -645,7 +645,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) blk_mq_exit_sched_shared_sbitmap(q); err_free_map_and_rqs: - blk_mq_sched_free_requests(q); + blk_mq_sched_free_rqs(q); blk_mq_sched_tags_teardown(q); q->elevator = NULL; return ret; @@ -655,7 +655,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) * called in either blk_queue_cleanup or elevator_switch, tagset * is required for freeing requests */ -void blk_mq_sched_free_requests(struct request_queue *q) +void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index b228ee067491..4f514e074b74 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -29,7 +29,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); -void blk_mq_sched_free_requests(struct request_queue *q); +void blk_mq_sched_free_rqs(struct request_queue *q);
static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk.h b/block/blk.h index 3165c16725d5..5a9fb24c2ed9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -212,7 +212,7 @@ static inline void elevator_exit(struct request_queue *q, { lockdep_assert_held(&q->sysfs_lock);
- blk_mq_sched_free_requests(q); + blk_mq_sched_free_rqs(q); __elevator_exit(q, e); }
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.16-rc1
commit f32e4eafaf29aa493bdea3123dac09ad135b599b
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Function blk_mq_clear_rq_mapping() will be used for shared sbitmap tags in future, so pass a driver tags pointer instead of the tagset container and HW queue index.
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Hannes Reinecke hare@suse.de Reviewed-by: Ming Lei ming.lei@redhat.com Link: https://lore.kernel.org/r/1633429419-228500-8-git-send-email-john.garry@huaw... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c index 4088732cdbf8..c564f8b0120b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2432,10 +2432,9 @@ static size_t order_to_size(unsigned int order) }
/* called before freeing request pool in @tags */ -static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, - struct blk_mq_tags *tags, unsigned int hctx_idx) +static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, + struct blk_mq_tags *tags) { - struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; struct page *page; unsigned long flags;
@@ -2444,7 +2443,7 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, unsigned long end = start + order_to_size(page->private); int i;
- for (i = 0; i < set->queue_depth; i++) { + for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsigned long rq_addr = (unsigned long)rq;
@@ -2468,8 +2467,11 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { + struct blk_mq_tags *drv_tags; struct page *page;
+ drv_tags = set->tags[hctx_idx]; + if (tags->static_rqs && set->ops->exit_request) { int i;
@@ -2483,7 +2485,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } }
- blk_mq_clear_rq_mapping(set, tags, hctx_idx); + blk_mq_clear_rq_mapping(drv_tags, tags);
while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru);
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.16-rc1
commit 4f245d5bf0f7432c881e22a77066160a6cba8e03
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Function blk_mq_clear_rq_mapping() is required to clear the sched tags mappings in driver tags rqs[].
But there is no need for a driver tags structure to clear its own mapping, so skip clearing the mapping in this scenario.
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Hannes Reinecke hare@suse.de Reviewed-by: Ming Lei ming.lei@redhat.com Link: https://lore.kernel.org/r/1633429419-228500-9-git-send-email-john.garry@huaw... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/block/blk-mq.c b/block/blk-mq.c index c564f8b0120b..fdaf3284a676 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2438,6 +2438,10 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct page *page; unsigned long flags;
+ /* There is no need to clear a driver tags own mapping */ + if (drv_tags == tags) + return; + list_for_each_entry(page, &tags->page_list, lru) { unsigned long start = (unsigned long)page_address(page); unsigned long end = start + order_to_size(page->private);
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.16-rc1
commit a7e7388dced47a10ca13ae95ca975ea2830f196b
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Put the functionality to update the sched shared sbitmap size in a common function.
Since the same formula is always used to resize, and everything it needs can be derived from the request queue argument, just pass the request queue pointer.
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Ming Lei ming.lei@redhat.com Reviewed-by: Hannes Reinecke hare@suse.de Link: https://lore.kernel.org/r/1633429419-228500-10-git-send-email-john.garry@hua... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-sched.c | 3 +-- block/blk-mq-tag.c | 6 ++++++ block/blk-mq-tag.h | 1 + block/blk-mq.c | 3 +-- 4 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index ef81653ff22d..dedb6740da69 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -574,8 +574,7 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) &queue->sched_breserved_tags; }
- sbitmap_queue_resize(&queue->sched_bitmap_tags, - queue->nr_requests - set->reserved_tags); + blk_mq_tag_update_sched_shared_sbitmap(queue);
return 0; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 98e4edd03ad4..727440e8c890 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -701,6 +701,12 @@ void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int s sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags); }
+void blk_mq_tag_update_sched_shared_sbitmap(struct request_queue *q) +{ + sbitmap_queue_resize(&q->sched_bitmap_tags, + q->nr_requests - q->tag_set->reserved_tags); +} + /** * blk_mq_unique_tag() - return a tag that is unique queue-wide * @rq: request for which to compute a unique tag diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index baa36e5f495d..5178ea4aa746 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -61,6 +61,7 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, unsigned int depth, bool can_grow); extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size); +extern void blk_mq_tag_update_sched_shared_sbitmap(struct request_queue *q);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, diff --git a/block/blk-mq.c b/block/blk-mq.c index fdaf3284a676..92c2876efeba 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3792,8 +3792,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) q->nr_requests = nr; if (blk_mq_is_sbitmap_shared(set->flags)) { if (q->elevator) - sbitmap_queue_resize(&q->sched_bitmap_tags, - nr - set->reserved_tags); + blk_mq_tag_update_sched_shared_sbitmap(q); else blk_mq_tag_resize_shared_sbitmap(set, nr); }
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.16-rc1
commit 63064be150e4b1ba1e4af594ef5aa81adf21a52d
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Add a function to combine allocating tags and the associated requests, and factor out common patterns to use this new function.
Some functions only call blk_mq_alloc_map_and_rqs() now, but more functionality will be added later.
Also make blk_mq_alloc_rq_map() and blk_mq_alloc_rqs() static since they are only used in blk-mq.c, and finally rename some functions for conciseness and consistency with other function names: - __blk_mq_alloc_map_and_{request -> rqs}() - blk_mq_alloc_{map_and_requests -> set_map_and_rqs}()
Suggested-by: Ming Lei ming.lei@redhat.com Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Hannes Reinecke hare@suse.de Reviewed-by: Ming Lei ming.lei@redhat.com Link: https://lore.kernel.org/r/1633429419-228500-11-git-send-email-john.garry@hua... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-sched.c | 15 +++-------- block/blk-mq-tag.c | 9 +------ block/blk-mq.c | 62 +++++++++++++++++++++++++------------------- block/blk-mq.h | 9 ++----- 4 files changed, 42 insertions(+), 53 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index dedb6740da69..a382fc31b8a6 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -518,21 +518,12 @@ static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { - struct blk_mq_tag_set *set = q->tag_set; - int ret; + hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx, + q->nr_requests);
- hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, - set->reserved_tags, set->flags); if (!hctx->sched_tags) return -ENOMEM; - - ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); - if (ret) { - blk_mq_free_rq_map(hctx->sched_tags, set->flags); - hctx->sched_tags = NULL; - } - - return ret; + return 0; }
/* called in queue's release handler, tagset has gone away */ diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 727440e8c890..c986251cf10c 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -659,7 +659,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tags *new; - bool ret;
if (!can_grow) return -EINVAL; @@ -671,15 +670,9 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth > MAX_SCHED_RQ) return -EINVAL;
- new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, - tags->nr_reserved_tags, set->flags); + new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); if (!new) return -ENOMEM; - ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); - if (ret) { - blk_mq_free_rq_map(new, set->flags); - return -ENOMEM; - }
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); blk_mq_free_rq_map(*tagsptr, set->flags); diff --git a/block/blk-mq.c b/block/blk-mq.c index 92c2876efeba..2d388a7c76a8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2513,11 +2513,11 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags) blk_mq_free_tags(tags, flags); }
-struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx, - unsigned int nr_tags, - unsigned int reserved_tags, - unsigned int flags) +static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags, + unsigned int flags) { struct blk_mq_tags *tags; int node; @@ -2565,8 +2565,9 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, return 0; }
-int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, - unsigned int hctx_idx, unsigned int depth) +static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; size_t rq_size, left; @@ -2985,25 +2986,34 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } }
-static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, - int hctx_idx) +struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int depth) { - unsigned int flags = set->flags; - int ret = 0; + struct blk_mq_tags *tags; + int ret;
- set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, - set->queue_depth, set->reserved_tags, flags); - if (!set->tags[hctx_idx]) - return false; + tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags, + set->flags); + if (!tags) + return NULL;
- ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, - set->queue_depth); - if (!ret) - return true; + ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); + if (ret) { + blk_mq_free_rq_map(tags, set->flags); + return NULL; + }
- blk_mq_free_rq_map(set->tags[hctx_idx], flags); - set->tags[hctx_idx] = NULL; - return false; + return tags; +} + +static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + int hctx_idx) +{ + set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, + set->queue_depth); + + return set->tags[hctx_idx]; }
static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, @@ -3048,7 +3058,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && - !__blk_mq_alloc_map_and_request(set, hctx_idx)) { + !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. In this @@ -3515,7 +3525,7 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) int i;
for (i = 0; i < set->nr_hw_queues; i++) { - if (!__blk_mq_alloc_map_and_request(set, i)) + if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; cond_resched(); } @@ -3534,7 +3544,7 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. */ -static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set) +static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; @@ -3694,7 +3704,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_mq_map;
- ret = blk_mq_alloc_map_and_requests(set); + ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map;
diff --git a/block/blk-mq.h b/block/blk-mq.h index ad2d74f887f2..f4021afa443a 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -55,13 +55,8 @@ void blk_mq_put_rq_ref(struct request *rq); void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags); -struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx, - unsigned int nr_tags, - unsigned int reserved_tags, - unsigned int flags); -int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, - unsigned int hctx_idx, unsigned int depth); +struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx, unsigned int depth);
/* * Internal helpers for request insertion into sw queues
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.16-rc1
commit 645db34e50501aac141713fb47a315e5202ff890
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Refactor blk_mq_free_map_and_requests() such that it can be used at many sites at which the tag map and rqs are freed.
Also rename to blk_mq_free_map_and_rqs(), which is shorter and matches the alloc equivalent.
Suggested-by: Ming Lei ming.lei@redhat.com Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Hannes Reinecke hare@suse.de Link: https://lore.kernel.org/r/1633429419-228500-12-git-send-email-john.garry@hua... Signed-off-by: Jens Axboe axboe@kernel.dk
Conflict: commit a846a8e6c9a5 ("blk-mq: don't free tags if the tag_set is used by other device in queue initialztion") is already backported, blk_mq_free_map_and_rqs() is moved to __blk_mq_update_nr_hw_queues() instead of blk_mq_realloc_hw_ctxs(). Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-tag.c | 3 +-- block/blk-mq.c | 38 +++++++++++++++++++++++--------------- block/blk-mq.h | 4 +++- 3 files changed, 27 insertions(+), 18 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c986251cf10c..7b0c09ac1de9 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -674,8 +674,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (!new) return -ENOMEM;
- blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); - blk_mq_free_rq_map(*tagsptr, set->flags); + blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); *tagsptr = new; } else { /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 2d388a7c76a8..d7fe8c5c5369 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3016,15 +3016,15 @@ static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, return set->tags[hctx_idx]; }
-static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx) { unsigned int flags = set->flags;
- if (set->tags && set->tags[hctx_idx]) { - blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); - blk_mq_free_rq_map(set->tags[hctx_idx], flags); - set->tags[hctx_idx] = NULL; + if (tags) { + blk_mq_free_rqs(set, tags, hctx_idx); + blk_mq_free_rq_map(tags, flags); } }
@@ -3105,8 +3105,10 @@ static void blk_mq_map_swqueue(struct request_queue *q) * fallback in case of a new remap fails * allocation */ - if (i && set->tags[i]) - blk_mq_free_map_and_requests(set, i); + if (i && set->tags[i]) { + blk_mq_free_map_and_rqs(set, set->tags[i], i); + set->tags[i] = NULL; + }
hctx->tags = NULL; continue; @@ -3533,8 +3535,10 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0;
out_unwind: - while (--i >= 0) - blk_mq_free_map_and_requests(set, i); + while (--i >= 0) { + blk_mq_free_map_and_rqs(set, set->tags[i], i); + set->tags[i] = NULL; + }
return -ENOMEM; } @@ -3724,8 +3728,10 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) return 0;
out_free_mq_rq_maps: - for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_requests(set, i); + for (i = 0; i < set->nr_hw_queues; i++) { + blk_mq_free_map_and_rqs(set, set->tags[i], i); + set->tags[i] = NULL; + } out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); @@ -3741,8 +3747,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j;
- for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_requests(set, i); + for (i = 0; i < set->nr_hw_queues; i++) { + blk_mq_free_map_and_rqs(set, set->tags[i], i); + set->tags[i] = NULL; + }
if (blk_mq_is_sbitmap_shared(set->flags)) blk_mq_exit_shared_sbitmap(set); @@ -3932,7 +3940,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); for (; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_requests(set, i); + blk_mq_free_map_and_rqs(set, set->tags[i], i);
set->nr_hw_queues = prev_nr_hw_queues; blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); diff --git a/block/blk-mq.h b/block/blk-mq.h index f4021afa443a..15ce74c266a6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -57,7 +57,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); - +void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx); /* * Internal helpers for request insertion into sw queues */
From: Nikolay Borisov nborisov@suse.com
mainline inclusion
from mainline-v5.13-rc1
commit 39aa56db50b9ca5cad597e561b4b160b6cbbb65b
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Signed-off-by: Nikolay Borisov nborisov@suse.com Reviewed-by: Johannes Thumshirn johannes.thumshirn@wdc.com Reviewed-by: Hannes Reinecke hare@suse.de Reviewed-by: Himanshu Madhani himanshu.madhani@oracle.com Link: https://lore.kernel.org/r/20210311081713.2763171-1-nborisov@suse.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-tag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 7b0c09ac1de9..5acbb4cd265d 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -624,7 +624,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock);
- if (flags & BLK_MQ_F_TAG_HCTX_SHARED) + if (blk_mq_is_sbitmap_shared(flags)) return tags;
if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { @@ -636,7 +636,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) { - if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) { + if (!blk_mq_is_sbitmap_shared(flags)) { sbitmap_queue_free(tags->bitmap_tags); sbitmap_queue_free(tags->breserved_tags); }
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.16-rc1
commit e155b0c238b20f0a866f4334d292656665836c8a
category: performance
bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently we use separate sbitmap pairs and active_queues atomic_t for shared sbitmap support.
However, a full set of static requests is allocated per HW queue, which is quite wasteful, considering that the total number of requests usable at any given time across all HW queues is limited by the shared sbitmap depth (for example, with 16 HW queues and a shared depth of 1024, 16 x 1024 static requests are allocated even though at most 1024 can ever be in flight).
As such, it is considerably more memory efficient in the case of shared sbitmap to allocate a set of static rqs per tag set or request queue, and not per HW queue.
So replace the sbitmap pairs and active_queues atomic_t with a shared tags per tagset and request queue, which will hold a set of shared static rqs.
Since there is now no valid HW queue index to be passed to the blk_mq_ops .init_request and .exit_request callbacks, pass an invalid index token. This changes the semantics of the APIs, such that the callback would need to validate the HW queue index before using it. Currently no user of shared sbitmap actually uses the HW queue index (as would be expected).
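As a purely hypothetical illustration (the driver name and callback body are invented, not part of this patch), a .init_request implementation that did want per-hctx setup would now have to guard against the invalid index token:

static int mydrv_init_request(struct blk_mq_tag_set *set, struct request *rq,
			      unsigned int hctx_idx, unsigned int numa_node)
{
	/*
	 * With shared tags the static rqs are set up once for the whole
	 * tag set, so hctx_idx may be the invalid token BLK_MQ_NO_HCTX_IDX
	 * rather than a real HW queue index.
	 */
	if (hctx_idx != BLK_MQ_NO_HCTX_IDX) {
		/* per-hctx setup would go here */
	}
	return 0;
}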
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Ming Lei ming.lei@redhat.com Link: https://lore.kernel.org/r/1633429419-228500-13-git-send-email-john.garry@hua... Signed-off-by: Jens Axboe axboe@kernel.dk Conflict: c6f9c0e2d53d ("blk-mq: allow hardware queue to get more tag while sharing a tag set") is merged, which will cause lots of conflicts for this patch, and in the meantime, the functionality will need to be adapted in that patch. Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-debugfs.c | 18 -------- block/blk-mq-sched.c | 82 ++++++++++++++++----------------- block/blk-mq-tag.c | 75 ++++++++++-------------------- block/blk-mq-tag.h | 6 +-- block/blk-mq.c | 102 +++++++++++++++++++++-------------------- block/blk-mq.h | 15 +++--- include/linux/blk-mq.h | 16 +++---- include/linux/blkdev.h | 4 +- 8 files changed, 133 insertions(+), 185 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f3a263a1bb43..6eca71417415 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -216,23 +216,6 @@ static int queue_tag_set_show(void *data, struct seq_file *m) seq_printf(m, "numa_node=%d\n", set->numa_node); seq_printf(m, "timeout=%u\n", set->timeout); seq_printf(m, "flags=%u\n", set->flags); - seq_printf(m, "active_queues_shared_sbitmap=%d\n", - atomic_read(&set->active_queues_shared_sbitmap)); - seq_printf(m, "pending_queues_shared_sbitmap=%d\n", - atomic_read(&set->pending_queues_shared_sbitmap)); - - return 0; -} - -static int queue_dtag_wait_time_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - unsigned int time = 0; - - if (test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) - time = jiffies_to_msecs(jiffies - READ_ONCE(q->dtag_wait_time)); - - seq_printf(m, "%u\n", time);
return 0; } @@ -245,7 +228,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store }, { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, { "tag_set", 0400, queue_tag_set_show, NULL }, - { "dtag_wait_time_ms", 0400, queue_dtag_wait_time_show, NULL }, { }, };
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a382fc31b8a6..36cfae6e1dd3 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -518,6 +518,11 @@ static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { + hctx->sched_tags = q->shared_sbitmap_tags; + return 0; + } + hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx, q->nr_requests);
@@ -526,61 +531,54 @@ static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, return 0; }
+static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) +{ + blk_mq_free_rq_map(queue->shared_sbitmap_tags); + queue->shared_sbitmap_tags = NULL; +} + /* called in queue's release handler, tagset has gone away */ -static void blk_mq_sched_tags_teardown(struct request_queue *q) +static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; int i;
queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { - blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); + if (!blk_mq_is_sbitmap_shared(q->tag_set->flags)) + blk_mq_free_rq_map(hctx->sched_tags); hctx->sched_tags = NULL; } } + + if (blk_mq_is_sbitmap_shared(flags)) + blk_mq_exit_sched_shared_sbitmap(q); }
static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) { struct blk_mq_tag_set *set = queue->tag_set; - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - struct blk_mq_hw_ctx *hctx; - int ret, i;
/* * Set initial depth at max so that we don't need to reallocate for * updating nr_requests. */ - ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, - &queue->sched_breserved_tags, - MAX_SCHED_RQ, set->reserved_tags, - set->numa_node, alloc_policy); - if (ret) - return ret; - - queue_for_each_hw_ctx(queue, hctx, i) { - hctx->sched_tags->bitmap_tags = - &queue->sched_bitmap_tags; - hctx->sched_tags->breserved_tags = - &queue->sched_breserved_tags; - } + queue->shared_sbitmap_tags = blk_mq_alloc_map_and_rqs(set, + BLK_MQ_NO_HCTX_IDX, + MAX_SCHED_RQ); + if (!queue->shared_sbitmap_tags) + return -ENOMEM;
blk_mq_tag_update_sched_shared_sbitmap(queue);
return 0; }
-static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) -{ - sbitmap_queue_free(&queue->sched_bitmap_tags); - sbitmap_queue_free(&queue->sched_breserved_tags); -} - int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { + unsigned int i, flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; - unsigned int i; int ret;
if (!e) { @@ -597,21 +595,21 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, BLKDEV_MAX_RQ);
- queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); + if (blk_mq_is_sbitmap_shared(flags)) { + ret = blk_mq_init_sched_shared_sbitmap(q); if (ret) - goto err_free_map_and_rqs; + return ret; }
- if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { - ret = blk_mq_init_sched_shared_sbitmap(q); + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); if (ret) goto err_free_map_and_rqs; }
ret = e->ops.init_sched(q, e); if (ret) - goto err_free_sbitmap; + goto err_free_map_and_rqs;
blk_mq_debugfs_register_sched(q);
@@ -631,12 +629,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return 0;
-err_free_sbitmap: - if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) - blk_mq_exit_sched_shared_sbitmap(q); err_free_map_and_rqs: blk_mq_sched_free_rqs(q); - blk_mq_sched_tags_teardown(q); + blk_mq_sched_tags_teardown(q, flags); + q->elevator = NULL; return ret; } @@ -650,9 +646,15 @@ void blk_mq_sched_free_rqs(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i;
- queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->sched_tags) - blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { + blk_mq_free_rqs(q->tag_set, q->shared_sbitmap_tags, + BLK_MQ_NO_HCTX_IDX); + } else { + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_tags) + blk_mq_free_rqs(q->tag_set, + hctx->sched_tags, i); + } } }
@@ -673,8 +675,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) blk_mq_debugfs_unregister_sched(q); if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); - blk_mq_sched_tags_teardown(q); - if (blk_mq_is_sbitmap_shared(flags)) - blk_mq_exit_sched_shared_sbitmap(q); + blk_mq_sched_tags_teardown(q, flags); q->elevator = NULL; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 5acbb4cd265d..6341718c6471 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -28,11 +28,10 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_sbitmap_shared(hctx->flags)) { struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) - atomic_inc(&set->active_queues_shared_sbitmap); + atomic_inc(&hctx->tags->active_queues); } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) @@ -59,14 +58,14 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; - struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set;
if (blk_mq_is_sbitmap_shared(hctx->flags)) { + struct request_queue *q = hctx->queue; + if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; - atomic_dec(&set->active_queues_shared_sbitmap); + atomic_dec(&tags->active_queues); } else { if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; @@ -80,12 +79,11 @@ void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_sbitmap_shared(hctx->flags)) { struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) && !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) { - WRITE_ONCE(q->dtag_wait_time, jiffies); - atomic_inc(&set->pending_queues_shared_sbitmap); + WRITE_ONCE(hctx->dtag_wait_time, jiffies); + atomic_inc(&hctx->tags->pending_queues); } } else { if (!test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) && @@ -100,14 +98,13 @@ void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force) { struct blk_mq_tags *tags = hctx->tags; struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set;
if (blk_mq_is_sbitmap_shared(hctx->flags)) { if (!test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) return;
if (!force && time_before(jiffies, - READ_ONCE(q->dtag_wait_time) + + READ_ONCE(hctx->dtag_wait_time) + BLK_MQ_DTAG_WAIT_EXPIRE)) return;
@@ -115,8 +112,8 @@ void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force) &q->queue_flags)) return;
- WRITE_ONCE(q->dtag_wait_time, jiffies); - atomic_dec(&set->pending_queues_shared_sbitmap); + WRITE_ONCE(hctx->dtag_wait_time, jiffies); + atomic_dec(&tags->pending_queues); } else { if (!test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) return; @@ -577,38 +574,10 @@ static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, return 0; }
-int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set) -{ - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - int i, ret; - - ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags, - set->queue_depth, set->reserved_tags, - set->numa_node, alloc_policy); - if (ret) - return ret; - - for (i = 0; i < set->nr_hw_queues; i++) { - struct blk_mq_tags *tags = set->tags[i]; - - tags->bitmap_tags = &set->__bitmap_tags; - tags->breserved_tags = &set->__breserved_tags; - } - - return 0; -} - -void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) -{ - sbitmap_queue_free(&set->__bitmap_tags); - sbitmap_queue_free(&set->__breserved_tags); -} - struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, - int node, unsigned int flags) + int node, int alloc_policy) { - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags); struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) { @@ -624,9 +593,6 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock);
- if (blk_mq_is_sbitmap_shared(flags)) - return tags; - if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { kfree(tags); return NULL; @@ -634,12 +600,10 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, return tags; }
-void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) +void blk_mq_free_tags(struct blk_mq_tags *tags) { - if (!blk_mq_is_sbitmap_shared(flags)) { - sbitmap_queue_free(tags->bitmap_tags); - sbitmap_queue_free(tags->breserved_tags); - } + sbitmap_queue_free(tags->bitmap_tags); + sbitmap_queue_free(tags->breserved_tags); kfree(tags); }
@@ -670,6 +634,13 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth > MAX_SCHED_RQ) return -EINVAL;
+ /* + * Only the sbitmap needs resizing since we allocated the max + * initially. + */ + if (blk_mq_is_sbitmap_shared(set->flags)) + return 0; + new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); if (!new) return -ENOMEM; @@ -690,12 +661,14 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size) { - sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags); + struct blk_mq_tags *tags = set->shared_sbitmap_tags; + + sbitmap_queue_resize(&tags->__bitmap_tags, size - set->reserved_tags); }
void blk_mq_tag_update_sched_shared_sbitmap(struct request_queue *q) { - sbitmap_queue_resize(&q->sched_bitmap_tags, + sbitmap_queue_resize(q->shared_sbitmap_tags->bitmap_tags, q->nr_requests - q->tag_set->reserved_tags); }
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 5178ea4aa746..2cd9e8f14b4f 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -43,16 +43,14 @@ struct blk_mq_tags {
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, - int node, unsigned int flags); -extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); + int node, int alloc_policy); +extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, struct sbitmap_queue *breserved_tags, unsigned int queue_depth, unsigned int reserved, int node, int alloc_policy);
-extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set); -extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); diff --git a/block/blk-mq.c b/block/blk-mq.c index d7fe8c5c5369..a5c18b303c41 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2474,7 +2474,10 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct blk_mq_tags *drv_tags; struct page *page;
- drv_tags = set->tags[hctx_idx]; + if (blk_mq_is_sbitmap_shared(set->flags)) + drv_tags = set->shared_sbitmap_tags; + else + drv_tags = set->tags[hctx_idx];
if (tags->static_rqs && set->ops->exit_request) { int i; @@ -2503,21 +2506,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } }
-void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags) +void blk_mq_free_rq_map(struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL;
- blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); }
static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, - unsigned int reserved_tags, - unsigned int flags) + unsigned int reserved_tags) { struct blk_mq_tags *tags; int node; @@ -2526,7 +2528,8 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node;
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags); + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL;
@@ -2534,7 +2537,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) { - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); return NULL; }
@@ -2543,7 +2546,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, node); if (!tags->static_rqs) { kfree(tags->rqs); - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); return NULL; }
@@ -2993,14 +2996,13 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags; int ret;
- tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags, - set->flags); + tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); if (!tags) return NULL;
ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { - blk_mq_free_rq_map(tags, set->flags); + blk_mq_free_rq_map(tags); return NULL; }
@@ -3010,6 +3012,12 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, int hctx_idx) { + if (blk_mq_is_sbitmap_shared(set->flags)) { + set->tags[hctx_idx] = set->shared_sbitmap_tags; + + return true; + } + set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, set->queue_depth);
@@ -3020,14 +3028,21 @@ void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { - unsigned int flags = set->flags; - if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); - blk_mq_free_rq_map(tags, flags); + blk_mq_free_rq_map(tags); } }
+static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + if (!blk_mq_is_sbitmap_shared(set->flags)) + blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); + + set->tags[hctx_idx] = NULL; +} + static void blk_mq_map_swqueue(struct request_queue *q) { unsigned int i, j, hctx_idx; @@ -3105,10 +3120,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) * fallback in case of a new remap fails * allocation */ - if (i && set->tags[i]) { - blk_mq_free_map_and_rqs(set, set->tags[i], i); - set->tags[i] = NULL; - } + if (i) + __blk_mq_free_map_and_rqs(set, i);
hctx->tags = NULL; continue; @@ -3482,7 +3495,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock);
q->nr_requests = set->queue_depth; - q->dtag_wait_time = jiffies;
/* * Default to classic polling @@ -3526,6 +3538,14 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i;
+ if (blk_mq_is_sbitmap_shared(set->flags)) { + set->shared_sbitmap_tags = blk_mq_alloc_map_and_rqs(set, + BLK_MQ_NO_HCTX_IDX, + set->queue_depth); + if (!set->shared_sbitmap_tags) + return -ENOMEM; + } + for (i = 0; i < set->nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; @@ -3535,9 +3555,12 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0;
out_unwind: - while (--i >= 0) { - blk_mq_free_map_and_rqs(set, set->tags[i], i); - set->tags[i] = NULL; + while (--i >= 0) + __blk_mq_free_map_and_rqs(set, i); + + if (blk_mq_is_sbitmap_shared(set->flags)) { + blk_mq_free_map_and_rqs(set, set->shared_sbitmap_tags, + BLK_MQ_NO_HCTX_IDX); }
return -ENOMEM; @@ -3712,26 +3735,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_mq_map;
- if (blk_mq_is_sbitmap_shared(set->flags)) { - atomic_set(&set->active_queues_shared_sbitmap, 0); - atomic_set(&set->pending_queues_shared_sbitmap, 0); - - if (blk_mq_init_shared_sbitmap(set)) { - ret = -ENOMEM; - goto out_free_mq_rq_maps; - } - } - mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list);
return 0;
-out_free_mq_rq_maps: - for (i = 0; i < set->nr_hw_queues; i++) { - blk_mq_free_map_and_rqs(set, set->tags[i], i); - set->tags[i] = NULL; - } out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); @@ -3747,13 +3755,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j;
- for (i = 0; i < set->nr_hw_queues; i++) { - blk_mq_free_map_and_rqs(set, set->tags[i], i); - set->tags[i] = NULL; - } + for (i = 0; i < set->nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i);
- if (blk_mq_is_sbitmap_shared(set->flags)) - blk_mq_exit_shared_sbitmap(set); + if (blk_mq_is_sbitmap_shared(set->flags)) { + blk_mq_free_map_and_rqs(set, set->shared_sbitmap_tags, + BLK_MQ_NO_HCTX_IDX); + }
for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); @@ -3791,12 +3799,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); - if (blk_mq_is_sbitmap_shared(set->flags)) { - hctx->sched_tags->bitmap_tags = - &q->sched_bitmap_tags; - hctx->sched_tags->breserved_tags = - &q->sched_breserved_tags; - } } else { ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); @@ -3940,7 +3942,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); for (; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_rqs(set, set->tags[i], i); + __blk_mq_free_map_and_rqs(set, i);
set->nr_hw_queues = prev_nr_hw_queues; blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); diff --git a/block/blk-mq.h b/block/blk-mq.h index 15ce74c266a6..5572277cf9a3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -54,7 +54,7 @@ void blk_mq_put_rq_ref(struct request *rq); */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); -void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags); +void blk_mq_free_rq_map(struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, @@ -313,24 +313,21 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, if (bt->sb.depth == 1) return true;
+ if (mq_unfair_dtag && !atomic_read(&hctx->tags->pending_queues)) + return true; + if (blk_mq_is_sbitmap_shared(hctx->flags)) { struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set;
- if (mq_unfair_dtag && - !atomic_read(&set->pending_queues_shared_sbitmap)) - return true; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; - users = atomic_read(&set->active_queues_shared_sbitmap); } else { - if (mq_unfair_dtag && !atomic_read(&hctx->tags->pending_queues)) - return true; if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; - users = atomic_read(&hctx->tags->active_queues); }
+ users = atomic_read(&hctx->tags->active_queues); + if (!users) return true;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e4e46229d0eb..1707f217c636 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -248,13 +248,11 @@ enum hctx_type { * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. - * @active_queues_shared_sbitmap: - * number of active request queues per tag set. - * @__bitmap_tags: A shared tags sbitmap, used over all hctx's - * @__breserved_tags: - * A shared reserved tags sbitmap, used over all hctx's * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. + * @shared_sbitmap_tags: + * Shared sbitmap set of tags. Has @nr_hw_queues elements. If + * set, shared by all @tags. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. @@ -271,13 +269,11 @@ struct blk_mq_tag_set { unsigned int timeout; unsigned int flags; void *driver_data; - atomic_t active_queues_shared_sbitmap; - atomic_t pending_queues_shared_sbitmap;
- struct sbitmap_queue __bitmap_tags; - struct sbitmap_queue __breserved_tags; struct blk_mq_tags **tags;
+ struct blk_mq_tags *shared_sbitmap_tags; + struct mutex tag_list_lock; struct list_head tag_list;
@@ -472,6 +468,8 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT)
+#define BLK_MQ_NO_HCTX_IDX (-1U) + struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 87116833d7ed..3acfadd6724b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -499,8 +499,7 @@ struct request_queue {
atomic_t nr_active_requests_shared_sbitmap;
- struct sbitmap_queue sched_bitmap_tags; - struct sbitmap_queue sched_breserved_tags; + struct blk_mq_tags *shared_sbitmap_tags;
struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP @@ -605,7 +604,6 @@ struct request_queue { #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS];
- unsigned long dtag_wait_time; KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3)
From: John Garry john.garry@huawei.com
mainline inclusion from mainline-v5.16-rc1 commit ae0f1a732f4a5db284e2af02c305255734efd19c category: performance bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Now that we use shared tags for shared sbitmap support, we don't require the tags sbitmap pointers, so drop them.
This essentially reverts commit 222a5ae03cdd ("blk-mq: Use pointers for blk_mq_tags bitmap tags").
Function blk_mq_init_bitmap_tags() is also removed, since it would only be a wrapper for blk_mq_init_bitmaps().
Reviewed-by: Ming Lei ming.lei@redhat.com Reviewed-by: Hannes Reinecke hare@suse.de Signed-off-by: John Garry john.garry@huawei.com Link: https://lore.kernel.org/r/1633429419-228500-14-git-send-email-john.garry@hua... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/bfq-iosched.c | 4 +-- block/blk-mq-debugfs.c | 8 +++--- block/blk-mq-tag.c | 56 +++++++++++++++--------------------------- block/blk-mq-tag.h | 7 ++---- block/blk-mq.c | 8 +++--- block/kyber-iosched.c | 4 +-- 6 files changed, 34 insertions(+), 53 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index f32ad0b84049..4bfea5e5354e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6381,8 +6381,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) struct blk_mq_tags *tags = hctx->sched_tags; unsigned int min_shallow;
- min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow); + min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); }
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 6eca71417415..ab8040485442 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -474,11 +474,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m, atomic_read(&tags->pending_queues));
seq_puts(m, "\nbitmap_tags:\n"); - sbitmap_queue_show(tags->bitmap_tags, m); + sbitmap_queue_show(&tags->bitmap_tags, m);
if (tags->nr_reserved_tags) { seq_puts(m, "\nbreserved_tags:\n"); - sbitmap_queue_show(tags->breserved_tags, m); + sbitmap_queue_show(&tags->breserved_tags, m); } }
@@ -509,7 +509,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) if (res) goto out; if (hctx->tags) - sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m); + sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); mutex_unlock(&q->sysfs_lock);
out: @@ -543,7 +543,7 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) if (res) goto out; if (hctx->sched_tags) - sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m); + sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); mutex_unlock(&q->sysfs_lock);
out: diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 6341718c6471..dd291a21bcda 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -46,9 +46,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { - sbitmap_queue_wake_all(tags->bitmap_tags); + sbitmap_queue_wake_all(&tags->bitmap_tags); if (include_reserve) - sbitmap_queue_wake_all(tags->breserved_tags); + sbitmap_queue_wake_all(&tags->breserved_tags); }
/* @@ -158,10 +158,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) WARN_ON_ONCE(1); return BLK_MQ_NO_TAG; } - bt = tags->breserved_tags; + bt = &tags->breserved_tags; tag_offset = 0; } else { - bt = tags->bitmap_tags; + bt = &tags->bitmap_tags; tag_offset = tags->nr_reserved_tags; }
@@ -212,9 +212,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) - bt = tags->breserved_tags; + bt = &tags->breserved_tags; else - bt = tags->bitmap_tags; + bt = &tags->bitmap_tags;
/* * If destination hw queue is changed, fake wake up on @@ -250,10 +250,10 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags); - sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu); + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); - sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu); + sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } }
@@ -404,9 +404,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
if (tags->nr_reserved_tags) - bt_tags_for_each(tags, tags->breserved_tags, fn, priv, + bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, flags | BT_TAG_ITER_RESERVED); - bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags); + bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); }
/** @@ -523,8 +523,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, continue;
if (tags->nr_reserved_tags) - bt_for_each(hctx, tags->breserved_tags, fn, priv, true); - bt_for_each(hctx, tags->bitmap_tags, fn, priv, false); + bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } blk_queue_exit(q); } @@ -556,24 +556,6 @@ int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, return -ENOMEM; }
-static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node, int alloc_policy) -{ - int ret; - - ret = blk_mq_init_bitmaps(&tags->__bitmap_tags, - &tags->__breserved_tags, - tags->nr_tags, tags->nr_reserved_tags, - node, alloc_policy); - if (ret) - return ret; - - tags->bitmap_tags = &tags->__bitmap_tags; - tags->breserved_tags = &tags->__breserved_tags; - - return 0; -} - struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node, int alloc_policy) @@ -593,7 +575,9 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock);
- if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { + if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, + total_tags, reserved_tags, node, + alloc_policy) < 0) { kfree(tags); return NULL; } @@ -602,8 +586,8 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
void blk_mq_free_tags(struct blk_mq_tags *tags) { - sbitmap_queue_free(tags->bitmap_tags); - sbitmap_queue_free(tags->breserved_tags); + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); kfree(tags); }
@@ -652,7 +636,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, * Don't need (or can't) update reserved tags here, they * remain static and should never need resizing. */ - sbitmap_queue_resize(tags->bitmap_tags, + sbitmap_queue_resize(&tags->bitmap_tags, tdepth - tags->nr_reserved_tags); }
@@ -663,12 +647,12 @@ void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int s { struct blk_mq_tags *tags = set->shared_sbitmap_tags;
- sbitmap_queue_resize(&tags->__bitmap_tags, size - set->reserved_tags); + sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags); }
void blk_mq_tag_update_sched_shared_sbitmap(struct request_queue *q) { - sbitmap_queue_resize(q->shared_sbitmap_tags->bitmap_tags, + sbitmap_queue_resize(&q->shared_sbitmap_tags->bitmap_tags, q->nr_requests - q->tag_set->reserved_tags); }
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 2cd9e8f14b4f..4374e1f1ee29 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -19,11 +19,8 @@ struct blk_mq_tags { */ atomic_t pending_queues;
- struct sbitmap_queue *bitmap_tags; - struct sbitmap_queue *breserved_tags; - - struct sbitmap_queue __bitmap_tags; - struct sbitmap_queue __breserved_tags; + struct sbitmap_queue bitmap_tags; + struct sbitmap_queue breserved_tags;
struct request **rqs; struct request **static_rqs; diff --git a/block/blk-mq.c b/block/blk-mq.c index a5c18b303c41..eb24773f127d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1169,14 +1169,14 @@ static inline unsigned int queued_to_index(unsigned int queued)
static bool __blk_mq_get_driver_tag(struct request *rq) { - struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags; + struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag;
blk_mq_tag_busy(rq->mq_hctx);
if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { - bt = rq->mq_hctx->tags->breserved_tags; + bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; } else { if (!hctx_may_queue(rq->mq_hctx, bt)) @@ -1222,7 +1222,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, struct sbitmap_queue *sbq;
list_del_init(&wait->entry); - sbq = hctx->tags->bitmap_tags; + sbq = &hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); @@ -1240,7 +1240,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct sbitmap_queue *sbq = hctx->tags->bitmap_tags; + struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 448ae410f510..3036f087ec7b 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -449,11 +449,11 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int shift = tags->bitmap_tags->sb.shift; + unsigned int shift = tags->bitmap_tags.sb.shift;
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
- sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); }
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
From: John Garry john.garry@huawei.com
mainline inclusion from mainline-v5.17-rc1 commit fea9f92f1748083cb82049ed503be30c3d3a9b69 category: bugfix bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Kashyap reports high CPU usage in blk_mq_queue_tag_busy_iter() and callees using megaraid SAS RAID card since moving to shared tags [0].
Previously, when shared tags were implemented as a shared sbitmap, this function was less than optimal, since we would iterate through all tags for all hctx's, yet only ever match up to the tagset depth's worth of requests.
Since the change to shared tags, things are even less efficient if we have parallel callers of blk_mq_queue_tag_busy_iter(). This is because in bt_iter() -> blk_mq_find_and_get_req() there would be more contention on accessing each request ref and tags->lock since they are now shared among all HW queues.
Optimise by having separate calls to bt_for_each() for when we're using shared tags. In this case no longer pass a hctx, as it is no longer relevant, and teach bt_iter() about this.
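To make the relaxed matching concrete, here is a condensed sketch of the check inside bt_iter() once the hctx is optional; this only restates the diff below (hctx is NULL when iterating the shared tags):

	/*
	 * With shared tags there is no meaningful per-hctx ownership of a
	 * request, so match on the request queue alone; in the per-hctx
	 * case, additionally require that the request belongs to this hctx.
	 */
	if (rq->q == q && (!hctx || rq->mq_hctx == hctx))
		ret = iter_data->fn(hctx, rq, iter_data->data, reserved);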
Ming suggested something along the lines of this change, apart from a different implementation.
[0] https://lore.kernel.org/linux-block/e4e92abbe9d52bcba6b8cc6c91c442cc@mail.gm...
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Hannes Reinecke hare@suse.de Reviewed-by: Ming Lei ming.lei@redhat.com Reported-and-tested-by: Kashyap Desai kashyap.desai@broadcom.com Fixes: e155b0c238b2 ("blk-mq: Use shared tags for shared sbitmap support") Link: https://lore.kernel.org/r/1638794990-137490-4-git-send-email-john.garry@huaw... Signed-off-by: Jens Axboe axboe@kernel.dk
Conflict: block/blk-mq-tag.c Signed-off-by: Zhang Wensheng zhangwensheng5@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-tag.c | 59 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 18 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index dd291a21bcda..daf7aca4700e 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -259,6 +259,7 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
struct bt_iter_data { struct blk_mq_hw_ctx *hctx; + struct request_queue *q; busy_iter_fn *fn; void *data; bool reserved; @@ -282,11 +283,18 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; - struct blk_mq_tags *tags = hctx->tags; + struct request_queue *q = iter_data->q; + struct blk_mq_tag_set *set = q->tag_set; bool reserved = iter_data->reserved; + struct blk_mq_tags *tags; struct request *rq; bool ret = true;
+ if (blk_mq_is_sbitmap_shared(set->flags)) + tags = set->shared_sbitmap_tags; + else + tags = hctx->tags; + if (!reserved) bitnr += tags->nr_reserved_tags; /* @@ -297,7 +305,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) if (!rq) return true;
- if (rq->q == hctx->queue && rq->mq_hctx == hctx) + if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) ret = iter_data->fn(hctx, rq, iter_data->data, reserved); blk_mq_put_rq_ref(rq); return ret; @@ -306,6 +314,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. + * @q: Request queue to examine. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request @@ -317,14 +326,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ -static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - busy_iter_fn *fn, void *data, bool reserved) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, + struct sbitmap_queue *bt, busy_iter_fn *fn, + void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, + .q = q, };
sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); @@ -501,9 +512,6 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { - struct blk_mq_hw_ctx *hctx; - int i; - /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * while the queue is frozen. So we can use q_usage_counter to avoid @@ -512,19 +520,34 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, if (!percpu_ref_tryget(&q->q_usage_counter)) return;
- queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_tags *tags = hctx->tags; - - /* - * If no software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { + struct blk_mq_tags *tags = q->tag_set->shared_sbitmap_tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags;
if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); + bt_for_each(NULL, q, bresv, fn, priv, true); + bt_for_each(NULL, q, btags, fn, priv, false); + } else { + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; + + /* + * If no software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, q, bresv, fn, priv, true); + bt_for_each(hctx, q, btags, fn, priv, false); + } } blk_queue_exit(q); }
From: John Garry john.garry@huawei.com
mainline inclusion from mainline-v5.16-rc1 commit 8bdf7b3fe1f48a2c1c212d4685903bba01409c0e category: bugfix bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We should not reference the queue tagset in blk_mq_sched_tags_teardown() (see function comment) for the blk-mq flags, so use the passed flags instead.
This solves a use-after-free, similarly fixed earlier (and since broken again) in commit f0c1c4d2864e ("blk-mq: fix use-after-free in blk_mq_exit_sched").
Reported-by: Linux Kernel Functional Testing lkft@linaro.org Reported-by: Naresh Kamboju naresh.kamboju@linaro.org Tested-by: Anders Roxell anders.roxell@linaro.org Fixes: e155b0c238b2 ("blk-mq: Use shared tags for shared sbitmap support") Signed-off-by: John Garry john.garry@huawei.com Link: https://lore.kernel.org/r/1634890340-15432-1-git-send-email-john.garry@huawe... Signed-off-by: Jens Axboe axboe@kernel.dk
Conflict: block/blk-mq-sched.c Signed-off-by: Zhang Wensheng zhangwensheng5@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 36cfae6e1dd3..b3c9eae72625 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -545,7 +545,7 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int fla
queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { - if (!blk_mq_is_sbitmap_shared(q->tag_set->flags)) + if (!blk_mq_is_sbitmap_shared(flags)) blk_mq_free_rq_map(hctx->sched_tags); hctx->sched_tags = NULL; }
From: John Garry john.garry@huawei.com
mainline inclusion from mainline-v5.16-rc1 commit 0994c64eb4159ba019e7fedc7ba0dd6a69235b40 category: bugfix bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Since it is now possible for a tagset to share a single set of tags, the iter function should not re-iterate those tags once per hardware queue in that case. Rather, it should just iterate once.
Fixes: e155b0c238b2 ("blk-mq: Use shared tags for shared sbitmap support") Reported-by: Kashyap Desai kashyap.desai@broadcom.com Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Ming Lei ming.lei@redhat.com Tested-by: Kashyap Desai kashyap.desai@broadcom.com Link: https://lore.kernel.org/r/1634550083-202815-1-git-send-email-john.garry@huaw... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Zhang Wensheng zhangwensheng5@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-tag.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index daf7aca4700e..24b48a2f7fba 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -454,9 +454,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { - int i; + unsigned int flags = tagset->flags; + int i, nr_tags;
- for (i = 0; i < tagset->nr_hw_queues; i++) { + nr_tags = blk_mq_is_sbitmap_shared(flags) ? 1 : tagset->nr_hw_queues; + + for (i = 0; i < nr_tags; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED);
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5 CVE: NA
--------------------------------
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/blk-mq.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1707f217c636..aa0c8ef9a50f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -270,14 +270,17 @@ struct blk_mq_tag_set { unsigned int flags; void *driver_data;
- struct blk_mq_tags **tags; + KABI_DEPRECATE(atomic_t, active_queues_shared_sbitmap) + KABI_DEPRECATE(atomic_t, pending_queues_shared_sbitmap) + KABI_DEPRECATE(struct sbitmap_queue, __bitmap_tags) + KABI_DEPRECATE(struct sbitmap_queue, __breserved_tags)
- struct blk_mq_tags *shared_sbitmap_tags; + struct blk_mq_tags **tags;
struct mutex tag_list_lock; struct list_head tag_list;
- KABI_RESERVE(1) + KABI_USE(1, struct blk_mq_tags *shared_sbitmap_tags) KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4)
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5 CVE: NA
--------------------------------
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-tag.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 4374e1f1ee29..b0a7892cdd9c 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -19,8 +19,12 @@ struct blk_mq_tags { */ atomic_t pending_queues;
- struct sbitmap_queue bitmap_tags; - struct sbitmap_queue breserved_tags; + KABI_DEPRECATE(struct sbitmap_queue *, bitmap_tags) + KABI_DEPRECATE(struct sbitmap_queue *, breserved_tags) + KABI_REPLACE(struct sbitmap_queue __bitmap_tags, + struct sbitmap_queue bitmap_tags) + KABI_REPLACE(struct sbitmap_queue __breserved_tags, + struct sbitmap_queue breserved_tags)
struct request **rqs; struct request **static_rqs;
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: 186917, https://gitee.com/openeuler/kernel/issues/I5N1S5 CVE: NA
--------------------------------
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3acfadd6724b..58475edd65f4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -499,8 +499,6 @@ struct request_queue {
atomic_t nr_active_requests_shared_sbitmap;
- struct blk_mq_tags *shared_sbitmap_tags; - struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); @@ -604,6 +602,8 @@ struct request_queue { #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS];
+ KABI_REPLACE(unsigned long dtag_wait_time, + struct blk_mq_tags *shared_sbitmap_tags) KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3)
From: Mark Rutland mark.rutland@arm.com
mainline inclusion from mainline-v6.0-rc3 commit 2e8cff0a0eee87b27f0cf87ad8310eb41b5886ab category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5PPS3?from=project-issue CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
On arm64, "rodata=full" has been suppored (but not documented) since commit:
c55191e96caa9d78 ("arm64: mm: apply r/o permissions of VM areas to its linear alias as well")
As it's necessary to determine the rodata configuration early during boot, arm64 has an early_param() handler for this, whereas init/main.c has a __setup() handler which is run later.
Unfortunately, this split meant that since commit:
f9a40b0890658330 ("init/main.c: return 1 from handled __setup() functions")
... passing "rodata=full" would result in a spurious warning from the __setup() handler (though RO permissions would be configured appropriately).
Further, "rodata=full" has been broken since commit:
0d6ea3ac94ca77c5 ("lib/kstrtox.c: add "false"/"true" support to kstrtobool()")
... which caused strtobool() to parse "full" as false (in addition to many other values not documented for the "rodata=" kernel parameter).
This patch fixes this breakage by:
* Moving the core parameter parser to an early_param() handler, such that it is available early.
* Adding an (optional) arch hook which arm64 can use to parse "full".
* Updating the documentation to mention that "full" is valid for arm64.
* Having the core parameter parser handle "on" and "off" explicitly, such that any undocumented values (e.g. typos such as "ful") are reported as errors rather than being silently accepted.
Note that __setup() and early_param() have opposite conventions for their return values, where __setup() uses 1 to indicate a parameter was handled and early_param() uses 0 to indicate a parameter was handled.
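As a reminder of those conventions, a minimal sketch with hypothetical handler names (not part of this patch):

	#include <linux/init.h>

	/* __setup() handlers return 1 to indicate the parameter was handled. */
	static int __init example_setup(char *str)
	{
		return 1;	/* handled */
	}
	__setup("example=", example_setup);

	/* early_param() handlers return 0 when handled, non-zero on error. */
	static int __init example_early(char *str)
	{
		return 0;	/* handled */
	}
	early_param("example", example_early);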
Fixes: f9a40b089065 ("init/main.c: return 1 from handled __setup() functions") Fixes: 0d6ea3ac94ca ("lib/kstrtox.c: add "false"/"true" support to kstrtobool()") Signed-off-by: Mark Rutland mark.rutland@arm.com Cc: Andy Shevchenko andy.shevchenko@gmail.com Cc: Ard Biesheuvel ardb@kernel.org Cc: Catalin Marinas catalin.marinas@arm.com Cc: Jagdish Gediya jvgediya@linux.ibm.com Cc: Matthew Wilcox willy@infradead.org Cc: Randy Dunlap rdunlap@infradead.org Cc: Will Deacon will@kernel.org Reviewed-by: Ard Biesheuvel ardb@kernel.org Link: https://lore.kernel.org/r/20220817154022.3974645-1-mark.rutland@arm.com Signed-off-by: Will Deacon will@kernel.org
conflicts: Documentation/admin-guide/kernel-parameters.txt arch/arm64/include/asm/setup.h arch/arm64/mm/mmu.c init/main.c
Signed-off-by: Xia Longlong xialonglong1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../admin-guide/kernel-parameters.txt | 2 ++ arch/arm64/include/asm/setup.h | 24 +++++++++++++++++++ arch/arm64/mm/mmu.c | 18 -------------- init/main.c | 18 +++++++++++--- 4 files changed, 41 insertions(+), 21 deletions(-) create mode 100644 arch/arm64/include/asm/setup.h
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 216b28514abe..1d7650717674 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4879,6 +4879,8 @@ rodata= [KNL] on Mark read-only kernel memory as read-only (default). off Leave read-only kernel memory writable for debugging. + full Mark read-only kernel memory and aliases as read-only + [arm64]
rockchip.usb_uart Enable the uart passthrough on the designated usb port diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h new file mode 100644 index 000000000000..29bcb5bb45a3 --- /dev/null +++ b/arch/arm64/include/asm/setup.h @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __ARM64_ASM_SETUP_H +#define __ARM64_ASM_SETUP_H + +#include <linux/string.h> +#include <uapi/asm/setup.h> + +static inline bool arch_parse_debug_rodata(char *arg) +{ + extern bool rodata_enabled; + extern bool rodata_full; + + if (arg && !strcmp(arg, "full")) { + rodata_enabled = true; + rodata_full = true; + return true; + } + + return false; +} +#define arch_parse_debug_rodata arch_parse_debug_rodata + +#endif diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a31f2124705e..78b9e489d8f6 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -632,24 +632,6 @@ static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, vm_area_add_early(vma); }
-static int __init parse_rodata(char *arg) -{ - int ret = strtobool(arg, &rodata_enabled); - if (!ret) { - rodata_full = false; - return 0; - } - - /* permit 'full' in addition to boolean options */ - if (strcmp(arg, "full")) - return -EINVAL; - - rodata_enabled = true; - rodata_full = true; - return 0; -} -early_param("rodata", parse_rodata); - #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static int __init map_entry_trampoline(void) { diff --git a/init/main.c b/init/main.c index a60dc38e081a..7f4e8a8964b1 100644 --- a/init/main.c +++ b/init/main.c @@ -1368,13 +1368,25 @@ static noinline void __init kernel_init_freeable(void);
#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX) bool rodata_enabled __ro_after_init = true; + +#ifndef arch_parse_debug_rodata +static inline bool arch_parse_debug_rodata(char *str) { return false; } +#endif + static int __init set_debug_rodata(char *str) { - if (strtobool(str, &rodata_enabled)) + if (arch_parse_debug_rodata(str)) + return 0; + + if (str && !strcmp(str, "on")) + rodata_enabled = true; + else if (str && !strcmp(str, "off")) + rodata_enabled = false; + else pr_warn("Invalid option string for rodata: '%s'\n", str); - return 1; + return 0; } -__setup("rodata=", set_debug_rodata); +early_param("rodata", set_debug_rodata); #endif
#ifdef CONFIG_STRICT_KERNEL_RWX
From: Zhang Wensheng zhangwensheng5@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5X6RT CVE: NA
--------------------------------
After introducing commit 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting"), the '%util' reported by iostat can be higher than reality. In fact, the device may be quite idle while iostat shows '%util' as a big number (e.g. 50%). It can be reproduced with fio:
fio --name=1 --direct=1 --bs=4k --rw=read --filename=/dev/sda --thinktime=4ms --runtime=180
We fix this by adding a switch (precise_iostat=1) to control whether or not io_ticks is accounted precisely.
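The gist of the precise mode, as a simplified sketch of the accounting decision (the full change is in the diff below; part_in_flight() counts requests currently outstanding on the partition):

	/* inside update_io_ticks(), once the stamp has been advanced */
	if (precise_iostat) {
		/* busy time only grows while I/O is actually outstanding */
		if (end || part_in_flight(part))
			__part_stat_add(part, io_ticks, now - stamp);
	} else {
		/* legacy approximation: credit the whole gap on completion,
		 * otherwise only one jiffy per request start */
		__part_stat_add(part, io_ticks, end ? now - stamp : 1);
	}

The switch is a boot-time option, i.e. booting with precise_iostat=1 on the kernel command line enables the precise accounting.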
Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting") Signed-off-by: Zhang Wensheng zhangwensheng5@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-core.c | 28 ++++++++++++++++++++++++++-- block/blk-merge.c | 2 ++ block/genhd.c | 2 +- include/linux/blkdev.h | 1 + include/linux/genhd.h | 1 + 5 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 7b6d5c8c036f..a4ec5e168312 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -62,6 +62,21 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
DEFINE_IDA(blk_queue_ida);
+bool precise_iostat; + +static int __init precise_iostat_setup(char *str) +{ + bool precise; + + if (!strtobool(str, &precise)) { + precise_iostat = precise; + pr_info("precise iostat %d\n", precise_iostat); + } + + return 1; +} +__setup("precise_iostat=", precise_iostat_setup); + /* * For queue allocation */ @@ -1258,9 +1273,14 @@ void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) unsigned long stamp; again: stamp = READ_ONCE(part->stamp); - if (unlikely(time_after(now, stamp))) { - if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) + if (unlikely(time_after(now, stamp)) && + likely(cmpxchg(&part->stamp, stamp, now) == stamp)) { + if (precise_iostat) { + if (end || part_in_flight(part)) + __part_stat_add(part, io_ticks, now - stamp); + } else { __part_stat_add(part, io_ticks, end ? now - stamp : 1); + } } if (part->partno) { part = &part_to_disk(part)->part0; @@ -1318,6 +1338,8 @@ void blk_account_io_done(struct request *req, u64 now) #else part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); #endif + if (precise_iostat) + part_stat_local_dec(part, in_flight[rq_data_dir(req)]); part_stat_unlock();
hd_struct_put(part); @@ -1333,6 +1355,8 @@ void blk_account_io_start(struct request *rq)
part_stat_lock(); update_io_ticks(rq->part, jiffies, false); + if (precise_iostat) + part_stat_local_inc(rq->part, in_flight[rq_data_dir(rq)]); part_stat_unlock(); }
diff --git a/block/blk-merge.c b/block/blk-merge.c index 6518e0ae2835..cfb88d2fbf38 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -696,6 +696,8 @@ static void blk_account_io_merge_request(struct request *req) if (blk_do_io_stat(req)) { part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + if (precise_iostat) + part_stat_local_dec(req->part, in_flight[rq_data_dir(req)]); part_stat_unlock();
hd_struct_put(req->part); diff --git a/block/genhd.c b/block/genhd.c index 70e24c554b31..a5a6840f2e85 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -112,7 +112,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } }
-static unsigned int part_in_flight(struct hd_struct *part) +unsigned int part_in_flight(struct hd_struct *part) { unsigned int inflight = 0; int cpu; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 58475edd65f4..e4bcb11d6202 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -29,6 +29,7 @@ #include <linux/pm.h> #include <linux/sbitmap.h>
+extern bool precise_iostat; struct module; struct scsi_ioctl_command;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 05927a1c6b5b..959add98b686 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -304,6 +304,7 @@ extern void disk_part_iter_exit(struct disk_part_iter *piter); extern bool disk_has_partitions(struct gendisk *disk);
/* block/genhd.c */ +extern unsigned int part_in_flight(struct hd_struct *part); extern void device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); static inline void add_disk(struct gendisk *disk)
From: Kuniyuki Iwashima kuniyu@amazon.com
mainline inclusion from mainline-v6.0-rc3 commit 9c80e79906b4ca440d09e7f116609262bb747909 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5Q4Z0 CVE: NA
--------------------------------
The assumption in __disable_kprobe() is wrong, and it could try to disarm an already disarmed kprobe and fire the WARN_ONCE() below. [0] We can easily reproduce this issue.
1. Write 0 to /sys/kernel/debug/kprobes/enabled.
# echo 0 > /sys/kernel/debug/kprobes/enabled
2. Run execsnoop. At this time, one kprobe is disabled.
# /usr/share/bcc/tools/execsnoop & [1] 2460 PCOMM PID PPID RET ARGS
# cat /sys/kernel/debug/kprobes/list ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE] ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE]
3. Write 1 to /sys/kernel/debug/kprobes/enabled, which changes kprobes_all_disarmed to false but does not arm the disabled kprobe.
# echo 1 > /sys/kernel/debug/kprobes/enabled
# cat /sys/kernel/debug/kprobes/list ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE] ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE]
4. Kill execsnoop, when __disable_kprobe() calls disarm_kprobe() for the disabled kprobe and hits the WARN_ONCE() in __disarm_kprobe_ftrace().
# fg /usr/share/bcc/tools/execsnoop ^C
Actually, WARN_ONCE() is fired twice, and __unregister_kprobe_top() misses some cleanups and leaves the aggregated kprobe in the hash table. Then, __unregister_trace_kprobe() initialises tk->rp.kp.list and creates an infinite loop like this.
aggregated kprobe.list -> kprobe.list -.
                                ^      |
                                '------'
In this situation, these commands fall into the infinite loop and result in RCU stall or soft lockup.
cat /sys/kernel/debug/kprobes/list : show_kprobe_addr() enters into the infinite loop with RCU.
/usr/share/bcc/tools/execsnoop : warn_kprobe_rereg() holds kprobe_mutex, and __get_valid_kprobe() is stuck in the loop.
To avoid the issue, make sure we don't call disarm_kprobe() for disabled kprobes.
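In code form, the guard is simply (a condensed view of the hunk below):

	/*
	 * Only disarm when the probe is actually armed: kprobes are not
	 * globally disarmed and this (parent) probe is not disabled.
	 */
	if (!kprobes_all_disarmed && !kprobe_disabled(orig_p))
		ret = disarm_kprobe(orig_p, true);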
[0] Failed to disarm kprobe-ftrace at __x64_sys_execve+0x0/0x40 (error -2) WARNING: CPU: 6 PID: 2460 at kernel/kprobes.c:1130 __disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129) Modules linked in: ena CPU: 6 PID: 2460 Comm: execsnoop Not tainted 5.19.0+ #28 Hardware name: Amazon EC2 c5.2xlarge/, BIOS 1.0 10/16/2017 RIP: 0010:__disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129) Code: 24 8b 02 eb c1 80 3d c4 83 f2 01 00 75 d4 48 8b 75 00 89 c2 48 c7 c7 90 fa 0f 92 89 04 24 c6 05 ab 83 01 e8 e4 94 f0 ff <0f> 0b 8b 04 24 eb b1 89 c6 48 c7 c7 60 fa 0f 92 89 04 24 e8 cc 94 RSP: 0018:ffff9e6ec154bd98 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffffff930f7b00 RCX: 0000000000000001 RDX: 0000000080000001 RSI: ffffffff921461c5 RDI: 00000000ffffffff RBP: ffff89c504286da8 R08: 0000000000000000 R09: c0000000fffeffff R10: 0000000000000000 R11: ffff9e6ec154bc28 R12: ffff89c502394e40 R13: ffff89c502394c00 R14: ffff9e6ec154bc00 R15: 0000000000000000 FS: 00007fe800398740(0000) GS:ffff89c812d80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c00057f010 CR3: 0000000103b54006 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: <TASK> __disable_kprobe (kernel/kprobes.c:1716) disable_kprobe (kernel/kprobes.c:2392) __disable_trace_kprobe (kernel/trace/trace_kprobe.c:340) disable_trace_kprobe (kernel/trace/trace_kprobe.c:429) perf_trace_event_unreg.isra.2 (./include/linux/tracepoint.h:93 kernel/trace/trace_event_perf.c:168) perf_kprobe_destroy (kernel/trace/trace_event_perf.c:295) _free_event (kernel/events/core.c:4971) perf_event_release_kernel (kernel/events/core.c:5176) perf_release (kernel/events/core.c:5186) __fput (fs/file_table.c:321) task_work_run (./include/linux/sched.h:2056 (discriminator 1) kernel/task_work.c:179 (discriminator 1)) exit_to_user_mode_prepare (./include/linux/resume_user_mode.h:49 kernel/entry/common.c:169 kernel/entry/common.c:201) syscall_exit_to_user_mode (./arch/x86/include/asm/jump_label.h:55 ./arch/x86/include/asm/nospec-branch.h:384 ./arch/x86/include/asm/entry-common.h:94 kernel/entry/common.c:133 kernel/entry/common.c:296) do_syscall_64 (arch/x86/entry/common.c:87) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7fe7ff210654 Code: 15 79 89 20 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb be 0f 1f 00 8b 05 9a cd 20 00 48 63 ff 85 c0 75 11 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3a f3 c3 48 83 ec 18 48 89 7c 24 08 e8 34 fc RSP: 002b:00007ffdbd1d3538 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 0000000000000008 RCX: 00007fe7ff210654 RDX: 0000000000000000 RSI: 0000000000002401 RDI: 0000000000000008 RBP: 0000000000000000 R08: 94ae31d6fda838a4 R0900007fe8001c9d30 R10: 00007ffdbd1d34b0 R11: 0000000000000246 R12: 00007ffdbd1d3600 R13: 0000000000000000 R14: fffffffffffffffc R15: 00007ffdbd1d3560 </TASK>
Link: https://lkml.kernel.org/r/20220813020509.90805-1-kuniyu@amazon.com Fixes: 69d54b916d83 ("kprobes: makes kprobes/enabled works correctly for optimized kprobes.") Signed-off-by: Kuniyuki Iwashima kuniyu@amazon.com Reported-by: Ayushman Dutta ayudutta@amazon.com Cc: "Naveen N. Rao" naveen.n.rao@linux.ibm.com Cc: Anil S Keshavamurthy anil.s.keshavamurthy@intel.com Cc: "David S. Miller" davem@davemloft.net Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Wang Nan wangnan0@huawei.com Cc: Kuniyuki Iwashima kuniyu@amazon.com Cc: Kuniyuki Iwashima kuni1840@gmail.com Cc: Ayushman Dutta ayudutta@amazon.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Chen Zhongjin chenzhongjin@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/kprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f7695a6eb724..9db84949fc91 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1785,11 +1785,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { /* - * If kprobes_all_disarmed is set, orig_p - * should have already been disarmed, so - * skip unneed disarming process. + * Don't be lazy here. Even if 'kprobes_all_disarmed' + * is false, 'orig_p' might not have been armed yet. + * Note arm_all_kprobes() __tries__ to arm all kprobes + * on the best effort basis. */ - if (!kprobes_all_disarmed) { + if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) { ret = disarm_kprobe(orig_p, true); if (ret) { p->flags &= ~KPROBE_FLAG_DISABLED;
From: Chen Zhongjin chenzhongjin@huawei.com
mainline inclusion from mainline-v6.0-rc3 commit fc2e426b1161761561624ebd43ce8c8d2fa058da category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5Q4RA CVE: NA
--------------------------------
When meeting ftrace trampolines in ORC unwinding, the unwinder uses the address of ftrace_{regs_}call to find the ORC entry, which gets the next frame at sp+176.
If an IRQ hits at sub $0xa8,%rsp, the next frame should be at sp+8 instead of sp+176. This makes the unwinder skip the correct frame and throw warnings such as "wrong direction" or "can't access registers", etc., depending on the content of the incorrect frame address.
By adding the offset *ip - ops->trampoline* to the base address of ftrace_{regs_}caller, we can get the correct address at which to look up the ORC entry.
Also rename "caller" to "tramp_addr" so that the variable name matches its content.
[ mingo: Clarified the changelog a bit. ]
Fixes: 6be7fa3c74d1 ("ftrace, orc, x86: Handle ftrace dynamically allocated trampolines") Signed-off-by: Chen Zhongjin chenzhongjin@huawei.com Signed-off-by: Ingo Molnar mingo@kernel.org Reviewed-by: Steven Rostedt (Google) rostedt@goodmis.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220819084334.244016-1-chenzhongjin@huawei.com Signed-off-by: Chen Zhongjin chenzhongjin@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kernel/unwind_orc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index c451d5f6422f..cc071c4c6524 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -93,22 +93,27 @@ static struct orc_entry *orc_find(unsigned long ip); static struct orc_entry *orc_ftrace_find(unsigned long ip) { struct ftrace_ops *ops; - unsigned long caller; + unsigned long tramp_addr, offset;
ops = ftrace_ops_trampoline(ip); if (!ops) return NULL;
+ /* Set tramp_addr to the start of the code copied by the trampoline */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) - caller = (unsigned long)ftrace_regs_call; + tramp_addr = (unsigned long)ftrace_regs_caller; else - caller = (unsigned long)ftrace_call; + tramp_addr = (unsigned long)ftrace_caller; + + /* Now place tramp_addr to the location within the trampoline ip is at */ + offset = ip - ops->trampoline; + tramp_addr += offset;
/* Prevent unlikely recursion */ - if (ip == caller) + if (ip == tramp_addr) return NULL;
- return orc_find(caller); + return orc_find(tramp_addr); } #else static struct orc_entry *orc_ftrace_find(unsigned long ip)
From: "Russell King (Oracle)" rmk+kernel@armlinux.org.uk
mainline inclusion from mainline-v6.0-rc7 commit 0152dfee235e87660f52a117fc9f70dc55956bb4 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5W7AX CVE: CVE-2022-3535
--------------------------------
When mvpp2 is unloaded, the driver-specific debugfs directory is not removed, which technically leads to a memory leak. However, this directory is only created when the first device is probed, so the hardware is present. Removing the module is only something a developer would do when e.g. testing out changes, so the module would be reloaded. So this memory leak is minor.
The original attempt in commit fe2c9c61f668 ("net: mvpp2: debugfs: fix memory leak when using debugfs_lookup()") that was labelled as a memory leak fix was not, it fixed a refcount leak, but in doing so created a problem when the module is reloaded - the directory already exists, but mvpp2_root is NULL, so we lose all debugfs entries. This fix has been reverted.
This is the alternative fix, where we remove the offending directory whenever the driver is unloaded.
Fixes: 21da57a23125 ("net: mvpp2: add a debugfs interface for the Header Parser") Signed-off-by: Russell King (Oracle) rmk+kernel@armlinux.org.uk Reviewed-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Reviewed-by: Marcin Wojtas mw@semihalf.com Link: https://lore.kernel.org/r/E1ofOAB-00CzkG-UO@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Hui Tang tanghui20@huawei.com Reviewed-by: zhangqiao zhangqiao22@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/marvell/mvpp2/mvpp2.h | 1 + drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c | 10 ++++++++-- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 13 ++++++++++++- 3 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h index d825eb021b22..e999ac2de34e 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h @@ -1434,6 +1434,7 @@ u32 mvpp2_read(struct mvpp2 *priv, u32 offset); void mvpp2_dbgfs_init(struct mvpp2 *priv, const char *name);
void mvpp2_dbgfs_cleanup(struct mvpp2 *priv); +void mvpp2_dbgfs_exit(void);
#ifdef CONFIG_MVPP2_PTP int mvpp22_tai_probe(struct device *dev, struct mvpp2 *priv); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c index 4a3baa7e0142..75e83ea2a926 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c @@ -691,6 +691,13 @@ static int mvpp2_dbgfs_port_init(struct dentry *parent, return 0; }
+static struct dentry *mvpp2_root; + +void mvpp2_dbgfs_exit(void) +{ + debugfs_remove(mvpp2_root); +} + void mvpp2_dbgfs_cleanup(struct mvpp2 *priv) { debugfs_remove_recursive(priv->dbgfs_dir); @@ -700,10 +707,9 @@ void mvpp2_dbgfs_cleanup(struct mvpp2 *priv)
void mvpp2_dbgfs_init(struct mvpp2 *priv, const char *name) { - struct dentry *mvpp2_dir, *mvpp2_root; + struct dentry *mvpp2_dir; int ret, i;
- mvpp2_root = debugfs_lookup(MVPP2_DRIVER_NAME, NULL); if (!mvpp2_root) mvpp2_root = debugfs_create_dir(MVPP2_DRIVER_NAME, NULL);
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 9d5c06a433a6..765ec3bc11ad 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -7171,7 +7171,18 @@ static struct platform_driver mvpp2_driver = { }, };
-module_platform_driver(mvpp2_driver); +static int __init mvpp2_driver_init(void) +{ + return platform_driver_register(&mvpp2_driver); +} +module_init(mvpp2_driver_init); + +static void __exit mvpp2_driver_exit(void) +{ + platform_driver_unregister(&mvpp2_driver); + mvpp2_dbgfs_exit(); +} +module_exit(mvpp2_driver_exit);
MODULE_DESCRIPTION("Marvell PPv2 Ethernet Driver - www.marvell.com"); MODULE_AUTHOR("Marcin Wojtas mw@semihalf.com");
From: Ryusuke Konishi konishi.ryusuke@gmail.com
mainline inclusion from mainline-v6.1-rc1 commit d0d51a97063db4704a5ef6bc978dddab1636a306 category: bugfix bugzilla: 187884,https://gitee.com/src-openeuler/kernel/issues/I5X2OB CVE: CVE-2022-3646
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If nilfs_attach_log_writer() fails to create a log writer thread, it frees the log writer's data structure without any cleanup. After commit e912a5b66837 ("nilfs2: use root object to get ifile"), this causes a leak of struct nilfs_root, which in turn leaks an ifile metadata inode and a kobject on that struct.
In addition, if the kernel is booted with panic_on_warn, the above ifile metadata inode leak will cause the following panic when the nilfs2 kernel module is removed:
kmem_cache_destroy nilfs2_inode_cache: Slab cache still has objects when called from nilfs_destroy_cachep+0x16/0x3a [nilfs2] WARNING: CPU: 8 PID: 1464 at mm/slab_common.c:494 kmem_cache_destroy+0x138/0x140 ... RIP: 0010:kmem_cache_destroy+0x138/0x140 Code: 00 20 00 00 e8 a9 55 d8 ff e9 76 ff ff ff 48 8b 53 60 48 c7 c6 20 70 65 86 48 c7 c7 d8 69 9c 86 48 8b 4c 24 28 e8 ef 71 c7 00 <0f> 0b e9 53 ff ff ff c3 48 81 ff ff 0f 00 00 77 03 31 c0 c3 53 48 ... Call Trace: <TASK> ? nilfs_palloc_freev.cold.24+0x58/0x58 [nilfs2] nilfs_destroy_cachep+0x16/0x3a [nilfs2] exit_nilfs_fs+0xa/0x1b [nilfs2] __x64_sys_delete_module+0x1d9/0x3a0 ? __sanitizer_cov_trace_pc+0x1a/0x50 ? syscall_trace_enter.isra.19+0x119/0x190 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ... </TASK> Kernel panic - not syncing: panic_on_warn set ...
This patch fixes these issues by calling the nilfs_detach_log_writer() cleanup function when spawning the log writer thread fails.
Link: https://lkml.kernel.org/r/20221007085226.57667-1-konishi.ryusuke@gmail.com Fixes: e912a5b66837 ("nilfs2: use root object to get ifile") Signed-off-by: Ryusuke Konishi konishi.ryusuke@gmail.com Reported-by: syzbot+7381dc4ad60658ca4c05@syzkaller.appspotmail.com Tested-by: Ryusuke Konishi konishi.ryusuke@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/nilfs2/segment.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 8350c2eaee75..977e0271bd4c 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2791,10 +2791,9 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL);
err = nilfs_segctor_start_thread(nilfs->ns_writer); - if (err) { - kfree(nilfs->ns_writer); - nilfs->ns_writer = NULL; - } + if (unlikely(err)) + nilfs_detach_log_writer(sb); + return err; }
From: Pablo Neira Ayuso pablo@netfilter.org
stable inclusion from stable-v5.10.146 commit 5d75fef3e61e797fab5c3fbba88caa74ab92ad47 category: bugfix bugzilla: 187890, https://gitee.com/src-openeuler/kernel/issues/I5X2IJ CVE: CVE-2022-42432
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit 559c36c5a8d730c49ef805a72b213d3bba155cc8 ]
nf_osf_find() incorrectly returns true on mismatch; this leads to copying an uninitialized memory area in nft_osf, which can be used to leak stale kernel stack data to userspace.
Fixes: 22c7652cdaa8 ("netfilter: nft_osf: Add version option support") Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Florian Westphal fw@strlen.de Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/netfilter/nfnetlink_osf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 79fbf37291f3..51e3953b414c 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -269,6 +269,7 @@ bool nf_osf_find(const struct sk_buff *skb, struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; + bool found = false;
memset(&ctx, 0, sizeof(ctx));
@@ -283,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb,
data->genre = f->genre; data->version = f->version; + found = true; break; }
- return true; + return found; } EXPORT_SYMBOL_GPL(nf_osf_find);
From: Baokun Li libaokun1@huawei.com
hulk inclusion category: bugfix bugzilla: 187821,https://gitee.com/openeuler/kernel/issues/I5X9U0 CVE: NA
--------------------------------
We got an issue as follows:
================================================================== kernel BUG at fs/ext4/extents_status.c:202! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 1 PID: 810 Comm: mount Not tainted 6.1.0-rc1-next-g9631525255e3 #352 RIP: 0010:__es_tree_search.isra.0+0xb8/0xe0 RSP: 0018:ffffc90001227900 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000077512a0f RCX: 0000000000000000 RDX: 0000000000000002 RSI: 0000000000002a10 RDI: ffff8881004cd0c8 RBP: ffff888177512ac8 R08: 47ffffffffffffff R09: 0000000000000001 R10: 0000000000000001 R11: 00000000000679af R12: 0000000000002a10 R13: ffff888177512d88 R14: 0000000077512a10 R15: 0000000000000000 FS: 00007f4bd76dbc40(0000)GS:ffff88842fd00000(0000)knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005653bf993cf8 CR3: 000000017bfdf000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> ext4_es_cache_extent+0xe2/0x210 ext4_cache_extents+0xd2/0x110 ext4_find_extent+0x5d5/0x8c0 ext4_ext_map_blocks+0x9c/0x1d30 ext4_map_blocks+0x431/0xa50 ext4_getblk+0x82/0x340 ext4_bread+0x14/0x110 ext4_quota_read+0xf0/0x180 v2_read_header+0x24/0x90 v2_check_quota_file+0x2f/0xa0 dquot_load_quota_sb+0x26c/0x760 dquot_load_quota_inode+0xa5/0x190 ext4_enable_quotas+0x14c/0x300 __ext4_fill_super+0x31cc/0x32c0 ext4_fill_super+0x115/0x2d0 get_tree_bdev+0x1d2/0x360 ext4_get_tree+0x19/0x30 vfs_get_tree+0x26/0xe0 path_mount+0x81d/0xfc0 do_mount+0x8d/0xc0 __x64_sys_mount+0xc0/0x160 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd </TASK> ==================================================================
Above issue may happen as follows:
-------------------------------------
ext4_fill_super
 ext4_orphan_cleanup
  ext4_enable_quotas
   ext4_quota_enable
    ext4_iget --> get error inode <5>
     ext4_ext_check_inode --> Wrong imode makes it escape inspection
     make_bad_inode(inode) --> EXT4_BOOT_LOADER_INO set imode
    dquot_load_quota_inode
     vfs_setup_quota_inode --> check pass
     dquot_load_quota_sb
      v2_check_quota_file
       v2_read_header
        ext4_quota_read
         ext4_bread
          ext4_getblk
           ext4_map_blocks
            ext4_ext_map_blocks
             ext4_find_extent
              ext4_cache_extents
               ext4_es_cache_extent
                __es_tree_search.isra.0
                 ext4_es_end --> Wrong extents trigger BUG_ON
In the above issue, s_usr_quota_inum is set to 5, but inode<5> contains an incorrect imode and disordered extents. Because 5 is EXT4_BOOT_LOADER_INO, the ext4_ext_check_inode() check in ext4_iget() can be bypassed; the unchecked extents then trigger the BUG_ON in __es_tree_search(). To solve this issue, check whether the inode is a bad inode in vfs_setup_quota_inode().
Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/quota/dquot.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 65f123d5809b..ad255f8ab5c5 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2319,6 +2319,8 @@ static int vfs_setup_quota_inode(struct inode *inode, int type) struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb);
+ if (is_bad_inode(inode)) + return -EUCLEAN; if (!S_ISREG(inode->i_mode)) return -EACCES; if (IS_RDONLY(inode))
From: Baokun Li libaokun1@huawei.com
hulk inclusion category: bugfix bugzilla: 187821,https://gitee.com/openeuler/kernel/issues/I5X9U0 CVE: NA
--------------------------------
Before quota is enabled, add a check on the preset quota inums in ext4_super_block to prevent wrong quota inodes from being loaded. In addition, when quota fails to be enabled, print the quota type and quota inum to facilitate fault locating.
Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/super.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4b99faccd182..3bb1a70c2db0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6488,6 +6488,20 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return err; }
+static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) +{ + switch (type) { + case USRQUOTA: + return qf_inum == EXT4_USR_QUOTA_INO; + case GRPQUOTA: + return qf_inum == EXT4_GRP_QUOTA_INO; + case PRJQUOTA: + return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; + default: + BUG(); + } +} + static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { @@ -6504,9 +6518,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, if (!qf_inums[type]) return -EPERM;
+ if (!ext4_check_quota_inum(type, qf_inums[type])) { + ext4_error(sb, "Bad quota inum: %lu, type: %d", + qf_inums[type], type); + return -EUCLEAN; + } + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { - ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); + ext4_error(sb, "Bad quota inode: %lu, type: %d", + qf_inums[type], type); return PTR_ERR(qf_inode); }
@@ -6545,8 +6566,9 @@ static int ext4_enable_quotas(struct super_block *sb) if (err) { ext4_warning(sb, "Failed to enable quota tracking " - "(type=%d, err=%d). Please run " - "e2fsck to fix.", type, err); + "(type=%d, err=%d, ino=%lu). " + "Please run e2fsck to fix.", type, + err, qf_inums[type]); for (type--; type >= 0; type--) { struct inode *inode;
From: Baokun Li libaokun1@huawei.com
hulk inclusion category: bugfix bugzilla: 187821,https://gitee.com/openeuler/kernel/issues/I5X9U0 CVE: NA
--------------------------------
There are many places that will get unhappy (and crash) when ext4_iget() returns a bad inode. However, when the boot loader inode is looked up via ext4_iget(), a bad inode is allowed to be returned, because that inode may not be initialized. This mechanism can be used to bypass some checks and cause a panic. To solve this problem, add a special iget flag, EXT4_IGET_BAD. Only with this flag will ext4_iget() return a bad inode; otherwise it always returns an error code if the inode is bad (suggested by Jan Kara).
Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/ext4.h | 3 ++- fs/ext4/inode.c | 8 +++++++- fs/ext4/ioctl.c | 3 ++- 3 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 01e7b3f1aa1a..8245b94d8fc6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2876,7 +2876,8 @@ int do_journal_get_write_access(handle_t *handle, typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ - EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ + EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ + EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */ } ext4_iget_flags;
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 23e8741408ac..3a8ec8a50b90 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5028,8 +5028,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); - brelse(iloc.bh); + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + ext4_error_inode(inode, function, line, 0, + "bad inode without EXT4_IGET_BAD flag"); + ret = -EUCLEAN; + goto bad_inode; + }
+ brelse(iloc.bh); unlock_new_inode(inode); return inode;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 413bf3d2f784..879e637ef3e1 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -121,7 +121,8 @@ static long swap_inode_boot_loader(struct super_block *sb, blkcnt_t blocks; unsigned short bytes;
- inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, + EXT4_IGET_SPECIAL | EXT4_IGET_BAD); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl);
From: Baokun Li libaokun1@huawei.com
hulk inclusion category: bugfix bugzilla: 187821,https://gitee.com/openeuler/kernel/issues/I5X9U0 CVE: NA
--------------------------------
We got an issue as follows:
================================================================== kernel BUG at fs/ext4/extents_status.c:203! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 1 PID: 945 Comm: cat Not tainted 6.0.0-next-20221007-dirty #349 RIP: 0010:ext4_es_end.isra.0+0x34/0x42 RSP: 0018:ffffc9000143b768 EFLAGS: 00010203 RAX: 0000000000000000 RBX: ffff8881769cd0b8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8fc27cf7 RDI: 00000000ffffffff RBP: ffff8881769cd0bc R08: 0000000000000000 R09: ffffc9000143b5f8 R10: 0000000000000001 R11: 0000000000000001 R12: ffff8881769cd0a0 R13: ffff8881768e5668 R14: 00000000768e52f0 R15: 0000000000000000 FS: 00007f359f7f05c0(0000)GS:ffff88842fd00000(0000)knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f359f5a2000 CR3: 000000017130c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> __es_tree_search.isra.0+0x6d/0xf5 ext4_es_cache_extent+0xfa/0x230 ext4_cache_extents+0xd2/0x110 ext4_find_extent+0x5d5/0x8c0 ext4_ext_map_blocks+0x9c/0x1d30 ext4_map_blocks+0x431/0xa50 ext4_mpage_readpages+0x48e/0xe40 ext4_readahead+0x47/0x50 read_pages+0x82/0x530 page_cache_ra_unbounded+0x199/0x2a0 do_page_cache_ra+0x47/0x70 page_cache_ra_order+0x242/0x400 ondemand_readahead+0x1e8/0x4b0 page_cache_sync_ra+0xf4/0x110 filemap_get_pages+0x131/0xb20 filemap_read+0xda/0x4b0 generic_file_read_iter+0x13a/0x250 ext4_file_read_iter+0x59/0x1d0 vfs_read+0x28f/0x460 ksys_read+0x73/0x160 __x64_sys_read+0x1e/0x30 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd </TASK> ==================================================================
In the above issue, ioctl invokes the swap_inode_boot_loader function to swap inode<5> and inode<12>. However, inode<5> contains an incorrect imode and disordered extents, and its i_nlink is set to 1. The extents check for the inode in ext4_iget() can be bypassed because 5 is EXT4_BOOT_LOADER_INO. Since links_count is 1, the extents are not initialized in swap_inode_boot_loader. After the ioctl command is executed successfully, the extents are swapped to inode<12>; running the `cat` command on inode<12> then triggers the BUG_ON due to the incorrect extents.
When the boot loader inode is not initialized, its imode can be one of the following:
 1) the imode is a bad type, which is marked as a bad inode in ext4_iget() and set to S_IFREG;
 2) the imode is a good type but not S_IFREG;
 3) the imode is S_IFREG.

The BUG_ON may be triggered by bypassing the check in cases 1 and 2. Therefore, when the boot loader inode is a bad inode or its imode is not S_IFREG, initialize the inode to avoid triggering the BUG.
Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 879e637ef3e1..9a3d893cd390 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -171,7 +171,7 @@ static long swap_inode_boot_loader(struct super_block *sb, /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl);
- if (inode_bl->i_nlink == 0) { + if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { /* this inode has never been used as a BOOT_LOADER */ set_nlink(inode_bl, 1); i_uid_write(inode_bl, 0);
From: Peilin Ye peilin.ye@bytedance.com
stable inclusion from stable-v5.10.138 commit 38ddccbda5e8b762c8ee06670bb1f64f1be5ee50 category: bugfix bugzilla: 187891, https://gitee.com/src-openeuler/kernel/issues/I5WYLL CVE: CVE-2022-3629
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 7e97cfed9929eaabc41829c395eb0d1350fccb9d upstream.
An O_NONBLOCK vsock_connect() request may try to reschedule @connect_work. Imagine the following sequence of vsock_connect() requests:
1. The 1st, non-blocking request schedules @connect_work, which will expire after 200 jiffies. Socket state is now SS_CONNECTING;
2. Later, the 2nd, blocking request gets interrupted by a signal after a few jiffies while waiting for the connection to be established. Socket state is back to SS_UNCONNECTED, but @connect_work is still pending, and will expire after 100 jiffies.
3. Now, the 3rd, non-blocking request tries to schedule @connect_work again. Since @connect_work is already scheduled, schedule_delayed_work() silently returns. sock_hold() is called twice, but sock_put() will only be called once in vsock_connect_timeout(), causing a memory leak reported by syzbot:
BUG: memory leak unreferenced object 0xffff88810ea56a40 (size 1232): comm "syz-executor756", pid 3604, jiffies 4294947681 (age 12.350s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 28 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 (..@............ backtrace: [<ffffffff837c830e>] sk_prot_alloc+0x3e/0x1b0 net/core/sock.c:1930 [<ffffffff837cbe22>] sk_alloc+0x32/0x2e0 net/core/sock.c:1989 [<ffffffff842ccf68>] __vsock_create.constprop.0+0x38/0x320 net/vmw_vsock/af_vsock.c:734 [<ffffffff842ce8f1>] vsock_create+0xc1/0x2d0 net/vmw_vsock/af_vsock.c:2203 [<ffffffff837c0cbb>] __sock_create+0x1ab/0x2b0 net/socket.c:1468 [<ffffffff837c3acf>] sock_create net/socket.c:1519 [inline] [<ffffffff837c3acf>] __sys_socket+0x6f/0x140 net/socket.c:1561 [<ffffffff837c3bba>] __do_sys_socket net/socket.c:1570 [inline] [<ffffffff837c3bba>] __se_sys_socket net/socket.c:1568 [inline] [<ffffffff837c3bba>] __x64_sys_socket+0x1a/0x20 net/socket.c:1568 [<ffffffff84512815>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<ffffffff84512815>] do_syscall_64+0x35/0x80 arch/x86/entry/common.c:80 [<ffffffff84600068>] entry_SYSCALL_64_after_hwframe+0x44/0xae <...>
Use mod_delayed_work() instead: if @connect_work is already scheduled, reschedule it, and undo sock_hold() to keep the reference count balanced.
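Condensed from the diff below, the fixed scheduling path looks like this (sketch, not the full context):

    sock_hold(sk);
    /* If connect_work was already queued, mod_delayed_work() reschedules
     * it and returns true; drop the reference just taken so that holds
     * and puts stay balanced. */
    if (mod_delayed_work(system_wq, &vsk->connect_work, timeout))
        sock_put(sk);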
Reported-and-tested-by: syzbot+b03f55bf128f9a38f064@syzkaller.appspotmail.com Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Co-developed-by: Stefano Garzarella sgarzare@redhat.com Signed-off-by: Stefano Garzarella sgarzare@redhat.com Reviewed-by: Stefano Garzarella sgarzare@redhat.com Signed-off-by: Peilin Ye peilin.ye@bytedance.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Dong Chenchen dongchenchen2@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/vmw_vsock/af_vsock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index c59806253a65..2811f0e45257 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1347,7 +1347,14 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, * timeout fires. */ sock_hold(sk); - schedule_delayed_work(&vsk->connect_work, timeout); + + /* If the timeout function is already scheduled, + * reschedule it, then ungrab the socket refcount to + * keep it balanced. + */ + if (mod_delayed_work(system_wq, &vsk->connect_work, + timeout)) + sock_put(sk);
/* Skip ahead to preserve error code set above. */ goto out_wait;
From: Alistair Popple apopple@nvidia.com
mainline inclusion from mainline-v6.1-rc1 commit 16ce101db85db694a91380aa4c89b25530871d33 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5VZ0L CVE: CVE-2022-3523
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Patch series "Fix several device private page reference counting issues", v2
This series aims to fix a number of page reference counting issues in drivers dealing with device private ZONE_DEVICE pages. These result in use-after-free type bugs, either from accessing a struct page which no longer exists because it has been removed or accessing fields within the struct page which are no longer valid because the page has been freed.
During normal usage it is unlikely these will cause any problems. However without these fixes it is possible to crash the kernel from userspace. These crashes can be triggered either by unloading the kernel module or unbinding the device from the driver prior to a userspace task exiting. In modules such as Nouveau it is also possible to trigger some of these issues by explicitly closing the device file-descriptor prior to the task exiting and then accessing device private memory.
This involves some minor changes to both PowerPC and AMD GPU code. Unfortunately I lack hardware to test either of those, so any help there would be appreciated. The changes mimic what is done for both Nouveau and hmm-tests though, so I doubt they will cause problems.
This patch (of 8):
When the CPU tries to access a device private page the migrate_to_ram() callback associated with the pgmap for the page is called. However no reference is taken on the faulting page. Therefore a concurrent migration of the device private page can free the page and possibly the underlying pgmap. This results in a race which can crash the kernel due to the migrate_to_ram() function pointer becoming invalid. It also means drivers can't reliably read the zone_device_data field because the page may have been freed with memunmap_pages().
Close the race by getting a reference on the page while holding the ptl to ensure it has not been freed. Unfortunately the elevated reference count will cause the migration required to handle the fault to fail. To avoid this failure pass the faulting page into the migrate_vma functions so that if an elevated reference count is found it can be checked to see if it's expected or not.
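In do_swap_page() this amounts to the following, condensed from the mm/memory.c hunk below (sketch):

    vmf->page = device_private_entry_to_page(entry);
    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                   vmf->address, &vmf->ptl);
    if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
        spin_unlock(vmf->ptl);
        goto out;
    }
    /* Take the reference while the PTE still maps the page, so a
     * concurrent migration cannot free it (or its pgmap) under us. */
    get_page(vmf->page);
    pte_unmap_unlock(vmf->pte, vmf->ptl);
    vmf->page->pgmap->ops->migrate_to_ram(vmf);
    put_page(vmf->page);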
[mpe@ellerman.id.au: fix build] Link: https://lkml.kernel.org/r/87fsgbf3gh.fsf@mpe.ellerman.id.au Link: https://lkml.kernel.org/r/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.166... Link: https://lkml.kernel.org/r/d3e813178a59e565e8d78d9b9a4e2562f6494f90.166436629... Signed-off-by: Alistair Popple apopple@nvidia.com Acked-by: Felix Kuehling Felix.Kuehling@amd.com Cc: Jason Gunthorpe jgg@nvidia.com Cc: John Hubbard jhubbard@nvidia.com Cc: Ralph Campbell rcampbell@nvidia.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Lyude Paul lyude@redhat.com Cc: Alex Deucher alexander.deucher@amd.com Cc: Alex Sierra alex.sierra@amd.com Cc: Ben Skeggs bskeggs@redhat.com Cc: Christian König christian.koenig@amd.com Cc: Dan Williams dan.j.williams@intel.com Cc: David Hildenbrand david@redhat.com Cc: "Huang, Ying" ying.huang@intel.com Cc: Matthew Wilcox willy@infradead.org Cc: Yang Shi shy828301@gmail.com Cc: Zi Yan ziy@nvidia.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Conflicts: arch/powerpc/kvm/book3s_hv_uvmem.c include/linux/migrate.h lib/test_hmm.c mm/migrate.c Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: tong tiangen tongtiangen@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/powerpc/kvm/book3s_hv_uvmem.c | 19 ++++++++------ include/linux/migrate.h | 9 +++++++ lib/test_hmm.c | 5 ++-- mm/memory.c | 16 +++++++++++- mm/migrate.c | 42 ++++++++++++++++++++---------- 5 files changed, 66 insertions(+), 25 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 3dd58b4ee33e..db17e5f5d431 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -506,10 +506,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) static int __kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, struct page *fault_page) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *dpage, *spage; struct kvmppc_uvmem_page_pvt *pvt; unsigned long pfn; @@ -523,6 +523,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma, mig.dst = &dst_pfn; mig.pgmap_owner = &kvmppc_uvmem_pgmap; mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + mig.fault_page = fault_page;
/* The requested page is already paged-out, nothing to do */ if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL)) @@ -578,12 +579,14 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma, static inline int kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, + struct page *fault_page) { int ret;
mutex_lock(&kvm->arch.uvmem_lock); - ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa); + ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, + fault_page); mutex_unlock(&kvm->arch.uvmem_lock);
return ret; @@ -632,7 +635,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, pvt->remove_gfn = true;
if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE, - PAGE_SHIFT, kvm, pvt->gpa)) + PAGE_SHIFT, kvm, pvt->gpa, NULL)) pr_err("Can't page out gpa:0x%lx addr:0x%lx\n", pvt->gpa, addr); } else { @@ -735,7 +738,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, bool pagein) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *spage; unsigned long pfn; struct page *dpage; @@ -993,7 +996,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
if (kvmppc_svm_page_out(vmf->vma, vmf->address, vmf->address + PAGE_SIZE, PAGE_SHIFT, - pvt->kvm, pvt->gpa)) + pvt->kvm, pvt->gpa, vmf->page)) return VM_FAULT_SIGBUS; else return 0; @@ -1064,7 +1067,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa, if (!vma || vma->vm_start > start || vma->vm_end < end) goto out;
- if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa)) + if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL)) ret = H_SUCCESS; out: mmap_read_unlock(kvm->mm); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 0f8d1583fa8e..a9de6d3ae07d 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -36,6 +36,9 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION
extern void putback_movable_pages(struct list_head *l); +extern int migrate_page_extra(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode, int extra_count); extern int migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); @@ -190,6 +193,12 @@ struct migrate_vma { */ void *pgmap_owner; unsigned long flags; + + /* + * Set to vmf->page if this is being called to migrate a page as part of + * a migrate_to_ram() callback. + */ + struct page *fault_page; };
int migrate_vma_setup(struct migrate_vma *args); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index a85613068d60..58d1e8c41889 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -671,7 +671,7 @@ static int dmirror_migrate(struct dmirror *dmirror, unsigned long src_pfns[64]; unsigned long dst_pfns[64]; struct dmirror_bounce bounce; - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long next; int ret;
@@ -1048,7 +1048,7 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long src_pfns; unsigned long dst_pfns; struct page *rpage; @@ -1071,6 +1071,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) args.dst = &dst_pfns; args.pgmap_owner = dmirror->mdevice; args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + args.fault_page = vmf->page;
if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; diff --git a/mm/memory.c b/mm/memory.c index cf49c1e23d0f..dbb0fb9bcf81 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3383,7 +3383,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->address); } else if (is_device_private_entry(entry)) { vmf->page = device_private_entry_to_page(entry); - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + spin_unlock(vmf->ptl); + goto out; + } + + /* + * Get a page reference while we know the page can't be + * freed. + */ + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->page->pgmap->ops->migrate_to_ram(vmf); + put_page(vmf->page); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { diff --git a/mm/migrate.c b/mm/migrate.c index 6cd51f3817b6..ebbc34d7c509 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -689,21 +689,15 @@ EXPORT_SYMBOL(migrate_page_copy); * Migration functions ***********************************************************/
-/* - * Common logic to directly migrate a single LRU page suitable for - * pages that do not use PagePrivate/PagePrivate2. - * - * Pages are locked upon entry and exit. - */ -int migrate_page(struct address_space *mapping, +int migrate_page_extra(struct address_space *mapping, struct page *newpage, struct page *page, - enum migrate_mode mode) + enum migrate_mode mode, int extra_count) { int rc;
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, extra_count);
if (rc != MIGRATEPAGE_SUCCESS) return rc; @@ -714,6 +708,19 @@ int migrate_page(struct address_space *mapping, migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } + +/* + * Common logic to directly migrate a single LRU page suitable for + * pages that do not use PagePrivate/PagePrivate2. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + return migrate_page_extra(mapping, newpage, page, mode, 0); +} EXPORT_SYMBOL(migrate_page);
#ifdef CONFIG_BLOCK @@ -2524,14 +2531,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * migrate_page_move_mapping(), except that here we allow migration of a * ZONE_DEVICE page. */ -static bool migrate_vma_check_page(struct page *page) +static bool migrate_vma_check_page(struct page *page, struct page *fault_page) { /* * One extra ref because caller holds an extra reference, either from * isolate_lru_page() for a regular page, or migrate_vma_collect() for * a device page. */ - int extra = 1; + int extra = 1 + (page == fault_page);
/* * FIXME support THP (transparent huge page), it is bit more complex to @@ -2639,7 +2646,7 @@ static void migrate_vma_prepare(struct migrate_vma *migrate) put_page(page); }
- if (!migrate_vma_check_page(page)) { + if (!migrate_vma_check_page(page, migrate->fault_page)) { if (remap) { migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; migrate->cpages--; @@ -2707,7 +2714,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) goto restore; }
- if (migrate_vma_check_page(page)) + if (migrate_vma_check_page(page, migrate->fault_page)) continue;
restore: @@ -2817,6 +2824,8 @@ int migrate_vma_setup(struct migrate_vma *args) return -EINVAL; if (!args->src || !args->dst) return -EINVAL; + if (args->fault_page && !is_device_private_page(args->fault_page)) + return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages); args->cpages = 0; @@ -3047,7 +3056,12 @@ void migrate_vma_pages(struct migrate_vma *migrate) } }
- r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + if (migrate->fault_page == page) + r = migrate_page_extra(mapping, newpage, page, + MIGRATE_SYNC_NO_COPY, 1); + else + r = migrate_page(mapping, newpage, page, + MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; }
From: Alistair Popple apopple@nvidia.com
mainline inclusion from mainline-v6.1-rc2 commit 97061d441110528dc02972818f2f1dad485107f9 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5VZ0L
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Commit 16ce101db85d ("mm/memory.c: fix race when faulting a device private page") changed the migrate_to_ram() callback to take a reference on the device page to ensure it can't be freed while handling the fault. Unfortunately the corresponding update to Nouveau to accommodate this change was inadvertently dropped from that patch, causing GPU to CPU migration to fail, so add it here.
Link: https://lkml.kernel.org/r/20221019122934.866205-1-apopple@nvidia.com Fixes: 16ce101db85d ("mm/memory.c: fix race when faulting a device private page") Signed-off-by: Alistair Popple apopple@nvidia.com Cc: John Hubbard jhubbard@nvidia.com Cc: Ralph Campbell rcampbell@nvidia.com Cc: Lyude Paul lyude@redhat.com Cc: Ben Skeggs bskeggs@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: tong tiangen tongtiangen@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 92987daa5e17..89d23e812a21 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -192,6 +192,7 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) .src = &src, .dst = &dst, .pgmap_owner = drm->dev, + .fault_page = vmf->page, .flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE, };
From: Ido Schimmel idosch@nvidia.com
stable inclusion from stable-v5.10.138 commit 0e28678a770df7989108327cfe86f835d8760c33 category: bugfix bugzilla: 187888 CVE: CVE-2022-3625
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 6b4db2e528f650c7fb712961aac36455468d5902 upstream.
After a failed devlink reload, devlink parameters are still registered, which means user space can set and get their values. In the case of the mlxsw "acl_region_rehash_interval" parameter, these operations will trigger a use-after-free [1].
Fix this by rejecting set and get operations while in the failed state. Return the "-EOPNOTSUPP" error code which does not abort the parameters dump, but instead causes it to skip over the problematic parameter.
Another possible fix is to perform these checks in the mlxsw parameter callbacks, but other drivers might be affected by the same problem and I am not aware of scenarios where these stricter checks will cause a regression.
[1] mlxsw_spectrum3 0000:00:10.0: Port 125: Failed to register netdev mlxsw_spectrum3 0000:00:10.0: Failed to create ports
================================================================== BUG: KASAN: use-after-free in mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get+0xbd/0xd0 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c:904 Read of size 4 at addr ffff8880099dcfd8 by task kworker/u4:4/777
CPU: 1 PID: 777 Comm: kworker/u4:4 Not tainted 5.19.0-rc7-custom-126601-gfe26f28c586d #1 Hardware name: QEMU MSN4700, BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: netns cleanup_net Call Trace: <TASK> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x92/0xbd lib/dump_stack.c:106 print_address_description mm/kasan/report.c:313 [inline] print_report.cold+0x5e/0x5cf mm/kasan/report.c:429 kasan_report+0xb9/0xf0 mm/kasan/report.c:491 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report_generic.c:306 mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get+0xbd/0xd0 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c:904 mlxsw_sp_acl_region_rehash_intrvl_get+0x49/0x60 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c:1106 mlxsw_sp_params_acl_region_rehash_intrvl_get+0x33/0x80 drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3854 devlink_param_get net/core/devlink.c:4981 [inline] devlink_nl_param_fill+0x238/0x12d0 net/core/devlink.c:5089 devlink_param_notify+0xe5/0x230 net/core/devlink.c:5168 devlink_ns_change_notify net/core/devlink.c:4417 [inline] devlink_ns_change_notify net/core/devlink.c:4396 [inline] devlink_reload+0x15f/0x700 net/core/devlink.c:4507 devlink_pernet_pre_exit+0x112/0x1d0 net/core/devlink.c:12272 ops_pre_exit_list net/core/net_namespace.c:152 [inline] cleanup_net+0x494/0xc00 net/core/net_namespace.c:582 process_one_work+0x9fc/0x1710 kernel/workqueue.c:2289 worker_thread+0x675/0x10b0 kernel/workqueue.c:2436 kthread+0x30c/0x3d0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 </TASK>
The buggy address belongs to the physical page: page:ffffea0000267700 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x99dc flags: 0x100000000000000(node=0|zone=1) raw: 0100000000000000 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff8880099dce80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8880099dcf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
ffff8880099dcf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
^ ffff8880099dd000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8880099dd080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ==================================================================
Fixes: 98bbf70c1c41 ("mlxsw: spectrum: add "acl_region_rehash_interval" devlink param") Signed-off-by: Ido Schimmel idosch@nvidia.com Reviewed-by: Jiri Pirko jiri@nvidia.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/devlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/core/devlink.c b/net/core/devlink.c index 646d90f63daf..72047750dcd9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3620,7 +3620,7 @@ static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->get) + if (!param->get || devlink->reload_failed) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } @@ -3629,7 +3629,7 @@ static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->set) + if (!param->set || devlink->reload_failed) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx); }
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5YRAC CVE: NA
--------------------------------
This reverts commit 84f7a9de0602704bbec774a6c7f7c8c4994bee9c.
The reverted commit introduces a problem where rq->__data_len ends up set to the wrong value.
before the patch:
 1) nr_bytes = rq->__data_len
 2) rq->__data_len = sdp->sector_size
 3) scsi_init_io()
 4) rq->__data_len = nr_bytes

after the patch:
 1) rq->__data_len = sdp->sector_size
 2) scsi_init_io()
 3) rq->__data_len = rq->__data_len -> __data_len is wrong
This causes the io to complete only one segment at a time, after which it is requeued in scsi_io_completion_action(), resulting in severe performance degradation.
SCSI WRITE SAME was removed from mainline in commit e383e16e84e9 ("scsi: sd: Remove WRITE_SAME support"), hence this patch is only needed for stable kernels.
Fixes: 84f7a9de0602 ("scsi: sd: Remove a local variable") Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Bart Van Assche bvanassche@acm.org Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/scsi/sd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 006db49b42de..b63262d7abcf 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1074,6 +1074,7 @@ static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) struct bio *bio = rq->bio; u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); + unsigned int nr_bytes = blk_rq_bytes(rq); blk_status_t ret;
if (sdkp->device->no_write_same) @@ -1110,7 +1111,7 @@ static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) */ rq->__data_len = sdp->sector_size; ret = scsi_alloc_sgtables(cmd); - rq->__data_len = blk_rq_bytes(rq); + rq->__data_len = nr_bytes;
return ret; }
From: David Jeffery djeffery@redhat.com
mainline inclusion from mainline-v5.18-rc1 commit 8f5fea65b06de1cc51d4fc23fb4d378d1abd6ed7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5YREM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?8f5...
--------------------------------
When blk_mq_delay_run_hw_queues sets an hctx to run in the future, it can reset the delay length for an already pending delayed work run_work. This creates a scenario where multiple hctx may have their queues set to run, but if one runs first and finds nothing to do, it can reset the delay of another hctx and stall the other hctx's ability to run requests.
To avoid this I/O stall when an hctx's run_work is already pending, leave it untouched to run at its current designated time rather than extending its delay. The work will still run, which keeps closed the race that blk_mq_delay_run_hw_queues() is needed for, while also avoiding the I/O stall.
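Condensed from the diff below, blk_mq_delay_run_hw_queues() now simply skips an hctx whose run_work is already queued (sketch):

    queue_for_each_hw_ctx(q, hctx, i) {
        if (blk_mq_hctx_stopped(hctx))
            continue;
        /* a pending delayed run keeps its original expiry; re-arming it
         * here could push back another hctx's scheduled run */
        if (delayed_work_pending(&hctx->run_work))
            continue;
        ...
    }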
Signed-off-by: David Jeffery djeffery@redhat.com Reviewed-by: Ming Lei ming.lei@redhat.com Link: https://lore.kernel.org/r/20220131203337.GA17666@redhat Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/block/blk-mq.c b/block/blk-mq.c index eb24773f127d..484f65d0b1e2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1830,6 +1830,14 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; + /* + * If there is already a run_work pending, leave the + * pending delay untouched. Otherwise, a hctx can stall + * if another hctx is re-delaying the other's work + * before the work executes. + */ + if (delayed_work_pending(&hctx->run_work)) + continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the
From: Tadeusz Struk tadeusz.struk@linaro.org
stable inclusion from stable-v5.10.148 commit 1b257f97fec43d7a8a4c9ada8538d14421861b0a category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5XTU4 CVE: CVE-2022-43750
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
commit a659daf63d16aa883be42f3f34ff84235c302198 upstream.
Syzbot found an issue in usbmon module, where the user space client can corrupt the monitor's internal memory, causing the usbmon module to crash the kernel with segfault, UAF, etc.
The reproducer mmaps the /dev/usbmon memory to user space, and overwrites it with arbitrary data, which causes all kinds of issues.
Return an -EPERM error from mon_bin_mmap() if the VM_WRITE flag is set. Also clear VM_MAYWRITE to make it impossible to change the mapping to writable later.
Cc: "Dmitry Vyukov" dvyukov@google.com Cc: stable stable@kernel.org Fixes: 6f23ee1fefdc ("USB: add binary API to usbmon") Suggested-by: PaX Team pageexec@freemail.hu # for the VM_MAYRITE portion Link: https://syzkaller.appspot.com/bug?id=2eb1f35d6525fa4a74d75b4244971e5b1411c95... Reported-by: syzbot+23f57c5ae902429285d7@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk tadeusz.struk@linaro.org Link: https://lore.kernel.org/r/20220919215957.205681-1-tadeusz.struk@linaro.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Zhao Wenhui zhaowenhui8@huawei.com Reviewed-by: Zhang Qiao zhangqiao22@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/usb/mon/mon_bin.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index f48a23adbc35..094e812e9e69 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1268,6 +1268,11 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) { /* don't do anything here: "fault" will set up page table entries */ vma->vm_ops = &mon_bin_vm_ops; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + + vma->vm_flags &= ~VM_MAYWRITE; vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma);
From: Baolin Wang baolin.wang@linux.alibaba.com
mainline inclusion from mainline-v6.1-rc1 commit fac35ba763ed07ba93154c95ffc0c4a55023707f category: bugfix bugzilla: 187864, https://gitee.com/src-openeuler/kernel/issues/I5X1Z9 CVE: CVE-2022-3623
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=...
--------------------------------
On some architectures (like ARM64), it can support CONT-PTE/PMD size hugetlb, which means it can support not only PMD/PUD size hugetlb (2M and 1G), but also CONT-PTE/PMD size (64K and 32M) if a 4K page size is specified.
So when looking up a CONT-PTE size hugetlb page by follow_page(), it will use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE size hugetlb in follow_page_pte(). However this pte entry lock is incorrect for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to get the correct lock, which is mm->page_table_lock.
That means the pte entry of the CONT-PTE size hugetlb under current pte lock is unstable in follow_page_pte(), we can continue to migrate or poison the pte entry of the CONT-PTE size hugetlb, which can cause some potential race issues, even though they are under the 'pte lock'.
For example, suppose thread A is trying to look up a CONT-PTE size hugetlb page via the move_pages() syscall under the lock; however, another thread B can migrate the CONT-PTE hugetlb page at the same time, which will cause thread A to get an incorrect page. If thread A also wants to do page migration, a data inconsistency error then occurs.
Moreover we have the same issue for CONT-PMD size hugetlb in follow_huge_pmd().
To fix the above issues, rename follow_huge_pmd() to follow_huge_pmd_pte() to handle both PMD and PTE level hugetlb, and use huge_pte_lock() to get the correct pte entry lock so that the pte entry stays stable.
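Condensed from the mm/hugetlb.c hunk below, the lookup now takes the lock that actually covers the hugetlb entry (sketch):

    ptep = huge_pte_offset(mm, address, huge_page_size(h));
    if (!ptep)
        return NULL;

    /* huge_pte_lock() picks the right lock for the hugetlb size, i.e.
     * mm->page_table_lock for CONT-PTE, so the entry stays stable. */
    ptl = huge_pte_lock(h, mm, ptep);
    pte = huge_ptep_get(ptep);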
Mike said:
Support for CONT_PMD/_PTE was added with bb9dd3df8ee9 ("arm64: hugetlb: refactor find_num_contig()") in the patch series "Support for contiguous pte hugepages", v4. However, I do not believe these code paths were executed until migration support was added with 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages"). I would go with 5480280d3f2d for the Fixes: tag.
Link: https://lkml.kernel.org/r/635f43bdd85ac2615a58405da82b4d33c6e5eb05.166201756... Fixes: 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages") Signed-off-by: Baolin Wang baolin.wang@linux.alibaba.com Suggested-by: Mike Kravetz mike.kravetz@oracle.com Reviewed-by: Mike Kravetz mike.kravetz@oracle.com Cc: David Hildenbrand david@redhat.com Cc: Muchun Song songmuchun@bytedance.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Conflicts: mm/hugetlb.c Signed-off-by: Liu Shixin liushixin2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/hugetlb.h | 8 ++++---- mm/gup.c | 14 +++++++++++++- mm/hugetlb.c | 27 +++++++++++++-------------- 3 files changed, 30 insertions(+), 19 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0dfe08439095..d691071f3ecd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -196,8 +196,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pd(struct vm_area_struct *vma, unsigned long address, hugepd_t hpd, int flags, int pdshift); -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int flags); +struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, + int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags); struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, @@ -283,8 +283,8 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma, return NULL; }
-static inline struct page *follow_huge_pmd(struct mm_struct *mm, - unsigned long address, pmd_t *pmd, int flags) +static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, + unsigned long address, int flags) { return NULL; } diff --git a/mm/gup.c b/mm/gup.c index 4e9945299fe5..606b4a61e6ba 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -486,6 +486,18 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); + + /* + * Considering PTE level hugetlb, like continuous-PTE hugetlb on + * ARM64 architecture. + */ + if (is_vm_hugetlb_page(vma)) { + page = follow_huge_pmd_pte(vma, address, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); @@ -641,7 +653,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if (pmd_none(pmdval)) return no_page_table(vma, flags); if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd(mm, address, pmd, flags); + page = follow_huge_pmd_pte(vma, address, flags); if (page) return page; return no_page_table(vma, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c3b73c0b7e7e..1367b7459bb2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6008,12 +6008,13 @@ follow_huge_pd(struct vm_area_struct *vma, }
struct page * __weak -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int flags) +follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags) { + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; spinlock_t *ptl; - pte_t pte; + pte_t *ptep, pte;
/* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == @@ -6021,17 +6022,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL;
retry: - ptl = pmd_lockptr(mm, pmd); - spin_lock(ptl); - /* - * make sure that the address range covered by this pmd is not - * unmapped from other threads. - */ - if (!pmd_huge(*pmd)) - goto out; - pte = huge_ptep_get((pte_t *)pmd); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); + if (!ptep) + return NULL; + + ptl = huge_pte_lock(h, mm, ptep); + pte = huge_ptep_get(ptep); if (pte_present(pte)) { - page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); + page = pte_page(pte) + + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); /* * try_grab_page() should always succeed here, because: a) we * hold the pmd (ptl) lock, and b) we've just checked that the @@ -6047,7 +6046,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, } else { if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pmd, ptl); + __migration_entry_wait(mm, ptep, ptl); goto retry; } /*
From: Ryusuke Konishi konishi.ryusuke@gmail.com
mainline inclusion from mainline-v6.0-rc3 commit 21a87d88c2253350e115029f14fe2a10a7e6c856 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5X1Z4 CVE: CVE-2022-3621
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If the i_mode field in the inode of a metadata file is corrupted on disk, the initialization of the bmap structure, which should have been done from nilfs_read_inode_common(), may not be performed. This causes a lockdep warning followed by a NULL pointer dereference at nilfs_bmap_lookup_at_level().
This patch fixes these issues by adding a missing sanity check for the i_mode field of the metadata file's inode.
Link: https://lkml.kernel.org/r/20221002030804.29978-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi konishi.ryusuke@gmail.com Reported-by: syzbot+2b32eb36c1a825b7a74c@syzkaller.appspotmail.com Reported-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp Tested-by: Ryusuke Konishi konishi.ryusuke@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Long Li leo.lilong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/nilfs2/inode.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 3e4874b0c55c..fb594edc0837 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -459,6 +459,8 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) + return -EIO; /* this inode is for metadata and corrupted */ if (inode->i_nlink == 0) return -ESTALE; /* this inode is deleted */
From: Dokyung Song dokyung.song@gmail.com
maillist inclusion category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5YGD6 CVE: CVE-2022-3628
Reference: https://patchwork.kernel.org/project/linux-wireless/patch/20221021061359.GA5...
--------------------------------
This patch fixes an intra-object buffer overflow in brcmfmac that occurs when the device provides a 'bsscfgidx' equal to or greater than the buffer size. The patch adds a check that leads to a safe failure if that is the case.
This fixes CVE-2022-3628.
UBSAN: array-index-out-of-bounds in drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c
index 52 is out of range for type 'brcmf_if *[16]'
CPU: 0 PID: 1898 Comm: kworker/0:2 Tainted: G O 5.14.0+ #132
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
Workqueue: events brcmf_fweh_event_worker
Call Trace:
 dump_stack_lvl+0x57/0x7d
 ubsan_epilogue+0x5/0x40
 __ubsan_handle_out_of_bounds+0x69/0x80
 ? memcpy+0x39/0x60
 brcmf_fweh_event_worker+0xae1/0xc00
 ? brcmf_fweh_call_event_handler.isra.0+0x100/0x100
 ? rcu_read_lock_sched_held+0xa1/0xd0
 ? rcu_read_lock_bh_held+0xb0/0xb0
 ? lockdep_hardirqs_on_prepare+0x273/0x3e0
 process_one_work+0x873/0x13e0
 ? lock_release+0x640/0x640
 ? pwq_dec_nr_in_flight+0x320/0x320
 ? rwlock_bug.part.0+0x90/0x90
 worker_thread+0x8b/0xd10
 ? __kthread_parkme+0xd9/0x1d0
 ? process_one_work+0x13e0/0x13e0
 kthread+0x379/0x450
 ? _raw_spin_unlock_irq+0x24/0x30
 ? set_kthread_struct+0x100/0x100
 ret_from_fork+0x1f/0x30
================================================================================
general protection fault, probably for non-canonical address 0xe5601c0020023fff: 0000 [#1] SMP KASAN
KASAN: maybe wild-memory-access in range [0x2b0100010011fff8-0x2b0100010011ffff]
CPU: 0 PID: 1898 Comm: kworker/0:2 Tainted: G O 5.14.0+ #132
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
Workqueue: events brcmf_fweh_event_worker
RIP: 0010:brcmf_fweh_call_event_handler.isra.0+0x42/0x100
Code: 89 f5 53 48 89 fb 48 83 ec 08 e8 79 0b 38 fe 48 85 ed 74 7e e8 6f 0b 38 fe 48 89 ea 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 8b 00 00 00 4c 8b 7d 00 44 89 e0 48 ba 00 00 00
RSP: 0018:ffffc9000259fbd8 EFLAGS: 00010207
RAX: dffffc0000000000 RBX: ffff888115d8cd50 RCX: 0000000000000000
RDX: 0560200020023fff RSI: ffffffff8304bc91 RDI: ffff888115d8cd50
RBP: 2b0100010011ffff R08: ffff888112340050 R09: ffffed1023549809
R10: ffff88811aa4c047 R11: ffffed1023549808 R12: 0000000000000045
R13: ffffc9000259fca0 R14: ffff888112340050 R15: ffff888112340000
FS: 0000000000000000(0000) GS:ffff88811aa00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000004053ccc0 CR3: 0000000112740000 CR4: 0000000000750ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
 brcmf_fweh_event_worker+0x117/0xc00
 ? brcmf_fweh_call_event_handler.isra.0+0x100/0x100
 ? rcu_read_lock_sched_held+0xa1/0xd0
 ? rcu_read_lock_bh_held+0xb0/0xb0
 ? lockdep_hardirqs_on_prepare+0x273/0x3e0
 process_one_work+0x873/0x13e0
 ? lock_release+0x640/0x640
 ? pwq_dec_nr_in_flight+0x320/0x320
 ? rwlock_bug.part.0+0x90/0x90
 worker_thread+0x8b/0xd10
 ? __kthread_parkme+0xd9/0x1d0
 ? process_one_work+0x13e0/0x13e0
 kthread+0x379/0x450
 ? _raw_spin_unlock_irq+0x24/0x30
 ? set_kthread_struct+0x100/0x100
 ret_from_fork+0x1f/0x30
Modules linked in: 88XXau(O) 88x2bu(O)
---[ end trace 41d302138f3ff55a ]---
RIP: 0010:brcmf_fweh_call_event_handler.isra.0+0x42/0x100
Code: 89 f5 53 48 89 fb 48 83 ec 08 e8 79 0b 38 fe 48 85 ed 74 7e e8 6f 0b 38 fe 48 89 ea 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 8b 00 00 00 4c 8b 7d 00 44 89 e0 48 ba 00 00 00
RSP: 0018:ffffc9000259fbd8 EFLAGS: 00010207
RAX: dffffc0000000000 RBX: ffff888115d8cd50 RCX: 0000000000000000
RDX: 0560200020023fff RSI: ffffffff8304bc91 RDI: ffff888115d8cd50
RBP: 2b0100010011ffff R08: ffff888112340050 R09: ffffed1023549809
R10: ffff88811aa4c047 R11: ffffed1023549808 R12: 0000000000000045
R13: ffffc9000259fca0 R14: ffff888112340050 R15: ffff888112340000
FS: 0000000000000000(0000) GS:ffff88811aa00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000004053ccc0 CR3: 0000000112740000 CR4: 0000000000750ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Kernel panic - not syncing: Fatal exception
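For context, a minimal sketch of the indexing pattern that the new bounds check guards, reconstructed from the UBSAN report above (the surrounding worker code is abbreviated and the exact statements may differ from the driver source):

  /* emsg.bsscfgidx is supplied by the device/firmware, while the interface
   * array holds only BRCMF_MAX_IFS (16) pointers per the UBSAN report, so
   * any index >= BRCMF_MAX_IFS reads past the end of the array.
   */
  if (event->emsg.bsscfgidx >= BRCMF_MAX_IFS) {
          bphy_err(drvr, "invalid bsscfg index: %u\n", event->emsg.bsscfgidx);
          goto event_free;
  }
  ifp = drvr->iflist[event->emsg.bsscfgidx];      /* now guaranteed in bounds */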
Reported-by: Dokyung Song dokyungs@yonsei.ac.kr Reported-by: Jisoo Jang jisoo.jang@yonsei.ac.kr Reported-by: Minsuk Kang linuxlovemin@yonsei.ac.kr Reviewed-by: Arend van Spriel aspriel@gmail.com Signed-off-by: Dokyung Song dokyung.song@gmail.com Signed-off-by: Liu Jian liujian56@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c index 430d2cca98b3..1285d3685c4f 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c @@ -228,6 +228,10 @@ static void brcmf_fweh_event_worker(struct work_struct *work) brcmf_fweh_event_name(event->code), event->code, event->emsg.ifidx, event->emsg.bsscfgidx, event->emsg.addr); + if (event->emsg.bsscfgidx >= BRCMF_MAX_IFS) { + bphy_err(drvr, "invalid bsscfg index: %u\n", event->emsg.bsscfgidx); + goto event_free; + }
/* convert event message */ emsg_be = &event->emsg;
From: Dongliang Mu mudongliangabcd@gmail.com
mainline inclusion from mainline-v6.1-rc1 commit 2e488f13755ffbb60f307e991b27024716a33b29 category: bugfix bugzilla: 187543, https://gitee.com/src-openeuler/kernel/issues/I5NZ98 CVE: CVE-2022-2978
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h...
-------------------------------
In alloc_inode(), inode_init_always() can return -ENOMEM if security_inode_alloc() fails, which leaves inode->i_private uninitialized. Then nilfs_is_metadata_file_inode() returns true and nilfs_free_inode() wrongly calls nilfs_mdt_destroy(), which frees the uninitialized inode->i_private and leads to crashes (e.g., UAF/GPF).
Fix this by moving security_inode_alloc() just prior to this_cpu_inc(nr_inodes), making it the last operation in inode_init_always() that can fail.
Link: https://lkml.kernel.org/r/CAFcO6XOcf1Jj2SeGt=jJV59wmhESeSKpfR0omdFRq+J9nD1vf... Reported-by: butt3rflyh4ck butterflyhuangxx@gmail.com Reported-by: Hao Sun sunhao.th@gmail.com Reported-by: Jiacheng Xu stitch@zju.edu.cn Reviewed-by: Christian Brauner (Microsoft) brauner@kernel.org Signed-off-by: Dongliang Mu mudongliangabcd@gmail.com Cc: Al Viro viro@zeniv.linux.org.uk Cc: stable@vger.kernel.org Signed-off-by: Al Viro viro@zeniv.linux.org.uk Signed-off-by: Li Lingfeng lilingfeng3@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c index 82090bfadb07..7436a17a20c1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -168,8 +168,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_wb_frn_history = 0; #endif
- if (security_inode_alloc(inode)) - goto out; spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -202,11 +200,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; + + if (unlikely(security_inode_alloc(inode))) + return -ENOMEM; this_cpu_inc(nr_inodes);
return 0; -out: - return -ENOMEM; } EXPORT_SYMBOL(inode_init_always);
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion from stable-v5.10.138 commit a0ae122e9aeccbff75014c4d36d11a9d32e7fb5e category: bugfix bugzilla: 187909, https://gitee.com/src-openeuler/kernel/issues/I5X3ML CVE: CVE-2022-3635
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 3f4093e2bf4673f218c0bf17d8362337c400e77b upstream.
There are use-after-free bugs caused by tst_timer. The root cause is that nothing stops tst_timer in idt77252_exit(). One of the possible race conditions is shown below:
 (thread 1)                 |        (thread 2)
                            | idt77252_init_one
                            |   init_card
                            |     fill_tst
                            |       mod_timer(&card->tst_timer, ...)
 idt77252_exit              | (wait a time)
                            | tst_timer
                            |   ...
   kfree(card) // FREE      |
                            |   card->soft_tst[e] // USE
The idt77252_dev is deallocated in idt77252_exit() but is still used in the timer handler.
This patch adds del_timer_sync() in idt77252_exit() so that the timer handler is stopped before the idt77252_dev is deallocated.
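A minimal sketch of the teardown ordering the patch enforces (illustrative only; in the driver the actual free happens later in the exit path):

  del_timer_sync(&card->tst_timer);  /* deactivate the timer and wait for a
                                      * tst_timer() handler already running
                                      * on another CPU to finish
                                      */
  kfree(card);                       /* safe now: no handler can touch
                                      * card->soft_tst[] anymore
                                      */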
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Duoming Zhou duoming@zju.edu.cn Link: https://lore.kernel.org/r/20220805070008.18007-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Zhang Changzhong zhangchangzhong@huawei.com Reviewed-by: Zhang Xiaoxu zhangxiaoxu5@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/atm/idt77252.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 5f0472c18bcb..82f6f1fbe9e7 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -3767,6 +3767,7 @@ static void __exit idt77252_exit(void) card = idt77252_chain; dev = card->atmdev; idt77252_chain = card->next; + del_timer_sync(&card->tst_timer);
if (dev->phy->stop) dev->phy->stop(dev);
From: Kefeng Wang wangkefeng.wang@huawei.com
hulk inclusion category: bugfix bugzilla: 187483, https://gitee.com/openeuler/kernel/issues/I5MH9N CVE: NA
--------------------------------
When memmap=nn[KMG]$ss[KMG] is supported, a resource is requested so that the reserved memory shows up in iomem. However, memblock_mark_memmap() calls memblock_setclr_flag(), which splits the memblock region, and request_resource() can then return -EBUSY when it is passed an unaligned address.
Let's use memblock_reserve() directly, drop memblock_setclr_flag(), and print an error if request_resource() in request_memmap_resources() returns an error code.
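For example (illustrative values only), booting with:

  memmap=64M$0x10000000

asks the kernel to reserve 64 MiB of RAM starting at physical address 0x10000000. With this change the range is reserved directly via memblock_reserve(), and request_memmap_resources() later publishes it as a "memmap reserved" child of the covering System RAM resource in /proc/iomem, printing a warning instead of failing silently when request_resource() returns an error.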
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/setup.c | 37 +++++++++++++++++++++++++++---------- arch/arm64/mm/init.c | 30 +++++++++++++++--------------- arch/arm64/mm/internal.h | 4 ++++ 3 files changed, 46 insertions(+), 25 deletions(-)
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index ddca8d27fca6..940e237009e9 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -219,17 +219,34 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys) static void __init request_memmap_resources(struct resource *res) { struct resource *memmap_res; + phys_addr_t base, size; + int i; + + for (i = 0; i < MAX_RES_REGIONS; i++) { + base = mbk_memmap_regions[i].base; + size = mbk_memmap_regions[i].size; + if (!size) + continue; + + if ((base < res->start) || (base + size - 1 > res->end)) + continue;
- memmap_res = memblock_alloc(sizeof(*memmap_res), SMP_CACHE_BYTES); - if (!memmap_res) - panic("%s: Failed to allocate memmap_res\n", __func__); + memmap_res = memblock_alloc(sizeof(*memmap_res), SMP_CACHE_BYTES); + if (!memmap_res) + panic("%s: Failed to allocate memmap_res\n", __func__);
- memmap_res->name = "memmap reserved"; - memmap_res->flags = IORESOURCE_MEM; - memmap_res->start = res->start; - memmap_res->end = res->end; + memmap_res->name = "memmap reserved"; + memmap_res->flags = IORESOURCE_MEM; + memmap_res->start = base; + memmap_res->end = base + size - 1;
- request_resource(res, memmap_res); + if (request_resource(res, memmap_res)) { + pr_warn("memmap reserve: [%llx, %llx] request resource fail\n", + memmap_res->start, memmap_res->end); + memblock_free_early(virt_to_phys(memmap_res), + sizeof(*memmap_res)); + } + } }
static void __init request_standard_resources(void) @@ -270,8 +287,8 @@ static void __init request_standard_resources(void) if (kernel_data.start >= res->start && kernel_data.end <= res->end) request_resource(res, &kernel_data); - if (memblock_is_memmap(region)) - request_memmap_resources(res); + + request_memmap_resources(res);
#ifdef CONFIG_KEXEC_CORE /* Userspace will find "Crash kernel" region in /proc/iomem. */ diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 5023c7e1f754..7ef7c0e7ee7c 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -297,10 +297,8 @@ static void __init fdt_enforce_memory_region(void) memblock_add(usable_rgns[1].base, usable_rgns[1].size); }
-#define MAX_RES_REGIONS 32 - -static struct memblock_region mbk_memmap_regions[MAX_RES_REGIONS] __initdata_memblock; -static int mbk_memmap_cnt __initdata; +struct memblock_region mbk_memmap_regions[MAX_RES_REGIONS] __initdata_memblock; +int mbk_memmap_cnt __initdata;
static void __init setup_mbk_memmap_regions(phys_addr_t base, phys_addr_t size) { @@ -317,6 +315,7 @@ static void __init setup_mbk_memmap_regions(phys_addr_t base, phys_addr_t size) static void __init reserve_memmap_regions(void) { phys_addr_t base, size; + const char *str; int i;
for (i = 0; i < mbk_memmap_cnt; i++) { @@ -324,26 +323,27 @@ static void __init reserve_memmap_regions(void) size = mbk_memmap_regions[i].size;
if (!memblock_is_region_memory(base, size)) { - pr_warn("memmap reserve: 0x%08llx - 0x%08llx is not a memory region - ignore\n", - base, base + size); - continue; + str = "is not a memory region - ignore"; + goto err; }
if (memblock_is_region_reserved(base, size)) { - pr_warn("memmap reserve: 0x%08llx - 0x%08llx overlaps in-use memory region - ignore\n", - base, base + size); - continue; + str = "overlaps in-use memory region - ignore"; + goto err; }
if (memblock_reserve(base, size)) { - pr_warn("memmap reserve: 0x%08llx - 0x%08llx failed\n", - base, base + size); - continue; + str = "failed"; + goto err; }
pr_info("memmap reserved: 0x%08llx - 0x%08llx (%lld MB)", - base, base + size, size >> 20); - memblock_mark_memmap(base, size); + base, base + size - 1, size >> 20); + continue; +err: + mbk_memmap_regions[i].size = 0; + pr_warn("memmap reserve: 0x%08llx - 0x%08llx %s\n", + base, base + size - 1, str); } }
diff --git a/arch/arm64/mm/internal.h b/arch/arm64/mm/internal.h index 9b8e20d87172..2af98d110668 100644 --- a/arch/arm64/mm/internal.h +++ b/arch/arm64/mm/internal.h @@ -22,4 +22,8 @@ static inline void __init reserve_quick_kexec(void) {} static inline void __init request_quick_kexec_res(struct resource *res) {} #endif
+#define MAX_RES_REGIONS 32 +extern struct memblock_region mbk_memmap_regions[MAX_RES_REGIONS]; +extern int mbk_memmap_cnt; + #endif /* ifndef _ARM64_MM_INTERNAL_H */
From: Liu Shixin liushixin2@huawei.com
maillist inclusion category: bugfix bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5NX1S
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?...
--------------------------------
The vmemmap pages are marked by kmemleak when allocated from memblock. Remove them from kmemleak when freeing the page. Otherwise, when the page is reused, kmemleak may report an error like the one below and then stop working.
kmemleak: Cannot insert 0xffff98fb6eab3d40 into the object search tree (overlaps existing)
kmemleak: Kernel memory leak detector disabled
kmemleak: Object 0xffff98fb6be00000 (size 335544320):
kmemleak:   comm "swapper", pid 0, jiffies 4294892296
kmemleak:   min_count = 0
kmemleak:   count = 0
kmemleak:   flags = 0x1
kmemleak:   checksum = 0
kmemleak:   backtrace:
Link: https://lkml.kernel.org/r/20220819094005.2928241-1-liushixin2@huawei.com Fixes: f41f2ed43ca5 (mm: hugetlb: free the vmemmap pages associated with each HugeTLB page) Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Muchun Song songmuchun@bytedance.com Cc: Matthew Wilcox willy@infradead.org Cc: Mike Kravetz mike.kravetz@oracle.com Cc: Oscar Salvador osalvador@suse.de Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/bootmem_info.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index 5b152dba7344..12dffa051fbd 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -12,6 +12,7 @@ #include <linux/memblock.h> #include <linux/bootmem_info.h> #include <linux/memory_hotplug.h> +#include <linux/kmemleak.h>
void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) { @@ -34,6 +35,7 @@ void put_page_bootmem(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); + kmemleak_free_part(page_to_virt(page), PAGE_SIZE); free_reserved_page(page); } }
From: Jiangshui Yang yangjiangshui@h-partners.com
yangjiangshui inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5XYPG CVE: NA
----------------------------------------------------------------------
The sm4-ccm and sm4-gcm algorithms of the sec module depend on the fallback TFM function. Therefore, the sm4_generic module of the crypto subsystem needs to be loaded.
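A sketch of why the generic module is needed (illustrative only, not the sec driver's exact code): a driver that keeps a software fallback typically allocates it through the crypto API, and instantiating "gcm(sm4)"/"ccm(sm4)" there requires the generic SM4 cipher to be available:

  #include <linux/crypto.h>
  #include <linux/err.h>
  #include <crypto/aead.h>

  static int example_alloc_sm4_gcm_fallback(struct crypto_aead **fallback)
  {
          /* The mask excludes implementations that are async or that need a
           * fallback themselves, so the request resolves to a synchronous
           * software implementation, which in turn needs the generic SM4
           * cipher (CONFIG_CRYPTO_SM4_GENERIC).
           */
          *fallback = crypto_alloc_aead("gcm(sm4)", 0,
                                        CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC);
          if (IS_ERR(*fallback))
                  return PTR_ERR(*fallback);
          return 0;
  }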
Signed-off-by: Jiangshui Yang yangjiangshui@h-partners.com Reviewed-by: Kai Ye yekai13@huawei.com Reviewed-by: Chao Liu liuchao173@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 9f16277a83d6..3d7e0e44358d 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6725,6 +6725,7 @@ CONFIG_CRYPTO_CHACHA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_SM4_GENERIC=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_TWOFISH_COMMON=m
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5Z2HJ
----------------------------------------------------------
Currently, only some fields of the QP/CQ/MR context are dumped. This information is not enough: it is very inconvenient to keep extending the current field-by-field output, and parsing that raw data is also troublesome.
This patch dumps the whole resource context in raw format to avoid the above problems.
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_restrack.c | 73 +------------------ 1 file changed, 3 insertions(+), 70 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c index 989a2af2e938..961036b31c18 100644 --- a/drivers/infiniband/hw/hns/hns_roce_restrack.c +++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c @@ -47,8 +47,6 @@ int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq) struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); struct hns_roce_v2_cq_context context; - u32 data[MAX_ENTRY_NUM] = {}; - int offset = 0; int ret;
if (!hr_dev->hw->query_cqc) @@ -58,23 +56,7 @@ int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq) if (ret) return -EINVAL;
- data[offset++] = hr_reg_read(&context, CQC_CQ_ST); - data[offset++] = hr_reg_read(&context, CQC_SHIFT); - data[offset++] = hr_reg_read(&context, CQC_CQE_SIZE); - data[offset++] = hr_reg_read(&context, CQC_CQE_CNT); - data[offset++] = hr_reg_read(&context, CQC_CQ_PRODUCER_IDX); - data[offset++] = hr_reg_read(&context, CQC_CQ_CONSUMER_IDX); - data[offset++] = hr_reg_read(&context, CQC_DB_RECORD_EN); - data[offset++] = hr_reg_read(&context, CQC_ARM_ST); - data[offset++] = hr_reg_read(&context, CQC_CMD_SN); - data[offset++] = hr_reg_read(&context, CQC_CEQN); - data[offset++] = hr_reg_read(&context, CQC_CQ_MAX_CNT); - data[offset++] = hr_reg_read(&context, CQC_CQ_PERIOD); - data[offset++] = hr_reg_read(&context, CQC_CQE_HOP_NUM); - data[offset++] = hr_reg_read(&context, CQC_CQE_BAR_PG_SZ); - data[offset++] = hr_reg_read(&context, CQC_CQE_BUF_PG_SZ); - - ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, offset * sizeof(u32), data); + ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context);
return ret; } @@ -118,8 +100,6 @@ int hns_roce_fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ib_qp) struct hns_roce_dev *hr_dev = to_hr_dev(ib_qp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ib_qp); struct hns_roce_v2_qp_context context; - u32 data[MAX_ENTRY_NUM] = {}; - int offset = 0; int ret;
if (!hr_dev->hw->query_qpc) @@ -129,42 +109,7 @@ int hns_roce_fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ib_qp) if (ret) return -EINVAL;
- data[offset++] = hr_reg_read(&context, QPC_QP_ST); - data[offset++] = hr_reg_read(&context, QPC_ERR_TYPE); - data[offset++] = hr_reg_read(&context, QPC_CHECK_FLG); - data[offset++] = hr_reg_read(&context, QPC_SRQ_EN); - data[offset++] = hr_reg_read(&context, QPC_SRQN); - data[offset++] = hr_reg_read(&context, QPC_QKEY_XRCD); - data[offset++] = hr_reg_read(&context, QPC_TX_CQN); - data[offset++] = hr_reg_read(&context, QPC_RX_CQN); - data[offset++] = hr_reg_read(&context, QPC_SQ_PRODUCER_IDX); - data[offset++] = hr_reg_read(&context, QPC_SQ_CONSUMER_IDX); - data[offset++] = hr_reg_read(&context, QPC_RQ_RECORD_EN); - data[offset++] = hr_reg_read(&context, QPC_RQ_PRODUCER_IDX); - data[offset++] = hr_reg_read(&context, QPC_RQ_CONSUMER_IDX); - data[offset++] = hr_reg_read(&context, QPC_SQ_SHIFT); - data[offset++] = hr_reg_read(&context, QPC_RQWS); - data[offset++] = hr_reg_read(&context, QPC_RQ_SHIFT); - data[offset++] = hr_reg_read(&context, QPC_SGE_SHIFT); - data[offset++] = hr_reg_read(&context, QPC_SQ_HOP_NUM); - data[offset++] = hr_reg_read(&context, QPC_RQ_HOP_NUM); - data[offset++] = hr_reg_read(&context, QPC_SGE_HOP_NUM); - data[offset++] = hr_reg_read(&context, QPC_WQE_SGE_BA_PG_SZ); - data[offset++] = hr_reg_read(&context, QPC_WQE_SGE_BUF_PG_SZ); - data[offset++] = hr_reg_read(&context, QPC_RETRY_NUM_INIT); - data[offset++] = hr_reg_read(&context, QPC_RETRY_CNT); - data[offset++] = hr_reg_read(&context, QPC_SQ_CUR_PSN); - data[offset++] = hr_reg_read(&context, QPC_SQ_MAX_PSN); - data[offset++] = hr_reg_read(&context, QPC_SQ_FLUSH_IDX); - data[offset++] = hr_reg_read(&context, QPC_SQ_MAX_IDX); - data[offset++] = hr_reg_read(&context, QPC_SQ_TX_ERR); - data[offset++] = hr_reg_read(&context, QPC_SQ_RX_ERR); - data[offset++] = hr_reg_read(&context, QPC_RQ_RX_ERR); - data[offset++] = hr_reg_read(&context, QPC_RQ_TX_ERR); - data[offset++] = hr_reg_read(&context, QPC_RQ_CQE_IDX); - data[offset++] = hr_reg_read(&context, QPC_RQ_RTY_TX_ERR); - - ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, offset * sizeof(u32), data); + ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context);
return ret; } @@ -204,8 +149,6 @@ int hns_roce_fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ib_mr) struct hns_roce_dev *hr_dev = to_hr_dev(ib_mr->device); struct hns_roce_mr *hr_mr = to_hr_mr(ib_mr); struct hns_roce_v2_mpt_entry context; - u32 data[MAX_ENTRY_NUM] = {}; - int offset = 0; int ret;
if (!hr_dev->hw->query_mpt) @@ -215,17 +158,7 @@ int hns_roce_fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ib_mr) if (ret) return -EINVAL;
- data[offset++] = hr_reg_read(&context, MPT_ST); - data[offset++] = hr_reg_read(&context, MPT_PD); - data[offset++] = hr_reg_read(&context, MPT_LKEY); - data[offset++] = hr_reg_read(&context, MPT_LEN_L); - data[offset++] = hr_reg_read(&context, MPT_LEN_H); - data[offset++] = hr_reg_read(&context, MPT_PBL_SIZE); - data[offset++] = hr_reg_read(&context, MPT_PBL_HOP_NUM); - data[offset++] = hr_reg_read(&context, MPT_PBL_BA_PG_SZ); - data[offset++] = hr_reg_read(&context, MPT_PBL_BUF_PG_SZ); - - ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, offset * sizeof(u32), data); + ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context);
return ret; }
From: Lang Cheng chenglang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5Z2DS
----------------------------------------------------------
The driver can notify the ULP with an IB event when the network link goes down or comes up.
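For context, a sketch of how a ULP would consume these events (illustrative only; names prefixed with example_ are hypothetical, and the registration normally happens in the ULP's client add callback):

  #include <rdma/ib_verbs.h>

  static struct ib_event_handler example_handler;

  static void example_port_event(struct ib_event_handler *handler,
                                 struct ib_event *event)
  {
          /* IB_EVENT_PORT_ACTIVE / IB_EVENT_PORT_ERR are what the driver
           * dispatches in the diff below via ib_dispatch_event().
           */
          if (event->event == IB_EVENT_PORT_ACTIVE ||
              event->event == IB_EVENT_PORT_ERR)
                  pr_info("port %u changed state\n", event->element.port_num);
  }

  static void example_register(struct ib_device *ibdev)
  {
          INIT_IB_EVENT_HANDLER(&example_handler, ibdev, example_port_event);
          ib_register_event_handler(&example_handler);
  }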
Signed-off-by: Lang Cheng chenglang@huawei.com Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 12 ++++++ drivers/infiniband/hw/hns/hns_roce_main.c | 45 +++++++++++++++------ 2 files changed, 45 insertions(+), 12 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 7a945cefd5a0..51e59084f875 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -640,6 +640,7 @@ struct hns_roce_ib_iboe { struct net_device *netdevs[HNS_ROCE_MAX_PORTS]; struct notifier_block nb; u8 phy_port[HNS_ROCE_MAX_PORTS]; + enum ib_port_state port_state[HNS_ROCE_MAX_PORTS]; };
struct hns_roce_ceqe { @@ -1095,6 +1096,17 @@ static inline u8 get_tclass(const struct ib_global_route *grh) grh->traffic_class >> DSCP_SHIFT : grh->traffic_class; }
+static inline u8 to_rdma_port_num(u8 phy_port_num) +{ + return phy_port_num + 1; +} + +static inline enum ib_port_state get_port_state(struct net_device *net_dev) +{ + return (netif_running(net_dev) && netif_carrier_ok(net_dev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; +} + void hns_roce_init_uar_table(struct hns_roce_dev *dev); int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar);
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 4fb8c4488195..cdb6def7923e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -89,10 +89,13 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context) }
static int handle_en_event(struct hns_roce_dev *hr_dev, u8 port, - unsigned long event) + unsigned long dev_event) { struct device *dev = hr_dev->dev; + enum ib_port_state port_state; struct net_device *netdev; + struct ib_event event; + unsigned long flags; int ret = 0;
netdev = hr_dev->iboe.netdevs[port]; @@ -101,20 +104,38 @@ static int handle_en_event(struct hns_roce_dev *hr_dev, u8 port, return -ENODEV; }
- switch (event) { - case NETDEV_UP: - case NETDEV_CHANGE: + switch (dev_event) { case NETDEV_REGISTER: case NETDEV_CHANGEADDR: ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr); break; + case NETDEV_UP: + case NETDEV_CHANGE: + ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr); + if (ret) + return ret; + fallthrough; case NETDEV_DOWN: - /* - * In v1 engine, only support all ports closed together. - */ + port_state = get_port_state(netdev); + + spin_lock_irqsave(&hr_dev->iboe.lock, flags); + if (hr_dev->iboe.port_state[port] == port_state) { + spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); + return NOTIFY_DONE; + } + hr_dev->iboe.port_state[port] = port_state; + spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); + + event.device = &hr_dev->ib_dev; + event.event = (port_state == IB_PORT_ACTIVE) ? + IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + event.element.port_num = to_rdma_port_num(port); + ib_dispatch_event(&event); + break; + case NETDEV_UNREGISTER: break; default: - dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(event)); + dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(dev_event)); break; }
@@ -151,6 +172,8 @@ static int hns_roce_setup_mtu_mac(struct hns_roce_dev *hr_dev) u8 i;
for (i = 0; i < hr_dev->caps.num_ports; i++) { + hr_dev->iboe.port_state[i] = IB_PORT_DOWN; + ret = hns_roce_set_mac(hr_dev, i, hr_dev->iboe.netdevs[i]->dev_addr); if (ret) @@ -245,9 +268,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u8 port_num,
mtu = iboe_get_mtu(net_dev->mtu); props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256; - props->state = netif_running(net_dev) && netif_carrier_ok(net_dev) ? - IB_PORT_ACTIVE : - IB_PORT_DOWN; + props->state = get_port_state(net_dev); props->phys_state = props->state == IB_PORT_ACTIVE ? IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; @@ -439,7 +460,7 @@ static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma) phys_addr_t pfn; pgprot_t prot; int ret; - + rdma_entry = rdma_user_mmap_entry_get_pgoff(uctx, vma->vm_pgoff); if (!rdma_entry) return -EINVAL;
From: Lang Cheng chenglang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5Z2DS
----------------------------------------------------------
When the netdev port status changes, the RoCE driver sends a port down event by parsing the netdev event dispatched by IB core, which takes about a few hundred milliseconds. However, this is sometimes not fast enough for the ULP.
The HNS NIC driver can notify the RoCE driver directly via a callback so that the port event is sent within only a few milliseconds. This patch implements that callback.
Signed-off-by: Lang Cheng chenglang@huawei.com Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 34 ++++++++++++++++++++++ 1 file changed, 34 insertions(+)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 450b7ac4333c..bd45d07619e9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -7046,9 +7046,43 @@ static int hns_roce_hw_v2_reset_notify(struct hnae3_handle *handle, return ret; }
+static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, + bool linkup) +{ + struct net_device *netdev = handle->rinfo.netdev; + struct hns_roce_dev *hr_dev = handle->priv; + struct ib_event event; + unsigned long flags; + u8 phy_port; + + if (linkup || !hr_dev) + return; + + for (phy_port = 0; phy_port < hr_dev->caps.num_ports; phy_port++) + if (netdev == hr_dev->iboe.netdevs[phy_port]) + break; + + if (phy_port == hr_dev->caps.num_ports) + return; + + spin_lock_irqsave(&hr_dev->iboe.lock, flags); + if (hr_dev->iboe.port_state[phy_port] == IB_PORT_DOWN) { + spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); + return; + } + hr_dev->iboe.port_state[phy_port] = IB_PORT_DOWN; + spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); + + event.device = &hr_dev->ib_dev; + event.element.port_num = to_rdma_port_num(phy_port); + event.event = IB_EVENT_PORT_ERR; + ib_dispatch_event(&event); +} + static const struct hnae3_client_ops hns_roce_hw_v2_ops = { .init_instance = hns_roce_hw_v2_init_instance, .uninit_instance = hns_roce_hw_v2_uninit_instance, + .link_status_change = hns_roce_hw_v2_link_status_change, .reset_notify = hns_roce_hw_v2_reset_notify, };
From: Yixing Liu liuyixing1@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5YZ0Q
----------------------------------------------------------
When the driver does not set an ex_cmd flag, the related ioctl() or syscall() fails. For example, if the modify_qp flag is missing, the following errors appear when running perftest:

Failed to modify QP to INIT, ret=95
Failed to modify QP to INIT
Failed to modify QP 24 to RTR
Failed to modify QP 25 to RTR
Unable to Connect the HCA's through the link
Unable to Connect the HCA's through the link
So add the query_device, create_cq, modify_qp and create_qp flags.
Fixes: df0651079380 ("RDMA/hns: Enable modify_cq for uverbs.") Signed-off-by: Yixing Liu liuyixing1@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index cdb6def7923e..9f30e242d23a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -654,7 +654,12 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) (1ULL << IB_USER_VERBS_CMD_CREATE_AH) | (1ULL << IB_USER_VERBS_CMD_DESTROY_AH);
- ib_dev->uverbs_ex_cmd_mask |= (1ULL << IB_USER_VERBS_EX_CMD_MODIFY_CQ); + ib_dev->uverbs_ex_cmd_mask |= + (1ULL << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | + (1ULL << IB_USER_VERBS_EX_CMD_MODIFY_CQ) | + (1ULL << IB_USER_VERBS_EX_CMD_CREATE_CQ) | + (1ULL << IB_USER_VERBS_EX_CMD_MODIFY_QP) | + (1ULL << IB_USER_VERBS_EX_CMD_CREATE_QP);
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_REREG_MR) { ib_dev->uverbs_cmd_mask |= (1ULL << IB_USER_VERBS_CMD_REREG_MR);
From: Xiang Chen chenxiang66@hisilicon.com
mainline inclusion from mainline-v5.19-rc7 commit 1e82e4627a795 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5M9GC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently, if a phy reset or phy enable is issued via sysfs while the controller is suspended, the operation is ignored because SAS_HA_REGISTERED is cleared. If RPM is enabled, the host may be suspended automatically and aggressively. In this case it may be difficult to enable or reset a phy via sysfs, so resume the host in these scenarios.
Link: https://lore.kernel.org/r/1657823002-139010-6-git-send-email-john.garry@huaw... Signed-off-by: Xiang Chen chenxiang66@hisilicon.com Signed-off-by: John Garry john.garry@huawei.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: xiabing xiabing12@h-partners.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/scsi/libsas/sas_init.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/scsi/libsas/sas_init.c b/drivers/scsi/libsas/sas_init.c index f1989a98f511..58ffcecf1a2f 100644 --- a/drivers/scsi/libsas/sas_init.c +++ b/drivers/scsi/libsas/sas_init.c @@ -528,6 +528,7 @@ static int queue_phy_reset(struct sas_phy *phy, int hard_reset) if (!d) return -ENOMEM;
+ pm_runtime_get_sync(ha->dev); /* libsas workqueue coordinates ata-eh reset with discovery */ mutex_lock(&d->event_lock); d->reset_result = 0; @@ -541,6 +542,7 @@ static int queue_phy_reset(struct sas_phy *phy, int hard_reset) if (rc == 0) rc = d->reset_result; mutex_unlock(&d->event_lock); + pm_runtime_put_sync(ha->dev);
return rc; } @@ -555,6 +557,7 @@ static int queue_phy_enable(struct sas_phy *phy, int enable) if (!d) return -ENOMEM;
+ pm_runtime_get_sync(ha->dev); /* libsas workqueue coordinates ata-eh reset with discovery */ mutex_lock(&d->event_lock); d->enable_result = 0; @@ -568,6 +571,7 @@ static int queue_phy_enable(struct sas_phy *phy, int enable) if (rc == 0) rc = d->enable_result; mutex_unlock(&d->event_lock); + pm_runtime_put_sync(ha->dev);
return rc; }