From: Yu Kuai <yukuai3@huawei.com>

mainline inclusion
from md-next
commit 7fdc91928ac109d3d1468ad7f951deb29a375e3d
category: performance
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5PRMO
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/song/md.git/commit/?h=md-nex...
--------------------------------
Currently, wake_up() is called unconditionally in fast paths such as raid10_make_request(), which causes lock contention under high concurrency:

raid10_make_request
 wake_up
  __wake_up_common_lock
   spin_lock_irqsave

Improve performance by only calling wake_up() if the waitqueue is not empty.
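The change reduces to the small helper added in the hunk below: wq_has_sleeper() performs the memory barrier needed to pair with the waiter side before checking whether the waitqueue is empty, so the spin_lock_irqsave() inside __wake_up_common_lock() is skipped entirely when nobody is waiting.

static void wake_up_barrier(struct r10conf *conf)
{
	/* wq_has_sleeper() supplies the barrier that pairs with the waiter */
	if (wq_has_sleeper(&conf->wait_barrier))
		wake_up(&conf->wait_barrier);
}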
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Acked-by: Guoqing Jiang <guoqing.jiang@linux.dev>
Signed-off-by: Song Liu <song@kernel.org>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 drivers/md/raid10.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5049c4d829e5..13e0b4a462fc 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -305,6 +305,12 @@ static void put_buf(struct r10bio *r10_bio) lower_barrier(conf); }
+static void wake_up_barrier(struct r10conf *conf) +{ + if (wq_has_sleeper(&conf->wait_barrier)) + wake_up(&conf->wait_barrier); +} + static void reschedule_retry(struct r10bio *r10_bio) { unsigned long flags; @@ -1025,7 +1031,7 @@ static void allow_barrier(struct r10conf *conf) { if ((atomic_dec_and_test(&conf->nr_pending)) || (conf->array_freeze_pending)) - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); }
static void freeze_array(struct r10conf *conf, int extra) @@ -1584,7 +1590,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) __make_request(mddev, bio, sectors);
/* In case raid10d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); return true; }
From: Yu Kuai <yukuai3@huawei.com>

mainline inclusion
from md-next
commit ddc489e066cd267b383c0eed4f576f6bdb154588
category: performance
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5PRMO
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/song/md.git/commit/?h=md-nex...
---------------------
Currently, wait_barrier() holds 'resync_lock' to read 'conf->barrier', and IO can't be dispatched until the 'barrier' is dropped.

Since holding the 'barrier' is not common, convert 'resync_lock' to a seqlock so that taking the lock can be avoided in the fast path.
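The heart of the conversion is the new lockless fast path shown in the hunk below: readers sample the seqcount, bump nr_pending, and only fall back to the write-locked slow path if a writer (raise_barrier()/freeze_array(), which now take write_seqlock_irq()) raced with them. Roughly:

static bool wait_barrier_nolock(struct r10conf *conf)
{
	unsigned int seq = read_seqbegin(&conf->resync_lock);

	if (READ_ONCE(conf->barrier))
		return false;

	/* optimistically account the IO before validating the read */
	atomic_inc(&conf->nr_pending);
	if (!read_seqretry(&conf->resync_lock, seq))
		return true;		/* no writer raced with us */

	/* a barrier was raised concurrently: back out, take the slow path */
	if (atomic_dec_and_test(&conf->nr_pending))
		wake_up_barrier(conf);

	return false;
}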
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-and-tested-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Song Liu <song@kernel.org>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 drivers/md/raid10.c | 88 ++++++++++++++++++++++++++++++---------------
 drivers/md/raid10.h |  2 +-
 2 files changed, 61 insertions(+), 29 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 13e0b4a462fc..f8fea9593955 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -112,6 +112,21 @@ static void end_reshape(struct r10conf *conf);
#include "raid1-10.c"
+#define NULL_CMD +#define cmd_before(conf, cmd) \ + do { \ + write_sequnlock_irq(&(conf)->resync_lock); \ + cmd; \ + } while (0) +#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock) + +#define wait_event_barrier_cmd(conf, cond, cmd) \ + wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \ + cmd_after(conf)) + +#define wait_event_barrier(conf, cond) \ + wait_event_barrier_cmd(conf, cond, NULL_CMD) + /* * for resync bio, r10bio pointer can be retrieved from the per-bio * 'struct resync_pages'. @@ -971,35 +986,54 @@ static void flush_pending_writes(struct r10conf *conf) static void raise_barrier(struct r10conf *conf, int force) { BUG_ON(force && !conf->barrier); - spin_lock_irq(&conf->resync_lock); + write_seqlock_irq(&conf->resync_lock);
/* Wait until no block IO is waiting (unless 'force') */ - wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, - conf->resync_lock); + wait_event_barrier(conf, force || !conf->nr_waiting);
/* block any new IO from starting */ - conf->barrier++; + WRITE_ONCE(conf->barrier, conf->barrier + 1);
/* Now wait for all pending IO to complete */ - wait_event_lock_irq(conf->wait_barrier, - !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH, - conf->resync_lock); + wait_event_barrier(conf, !atomic_read(&conf->nr_pending) && + conf->barrier < RESYNC_DEPTH);
- spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); }
static void lower_barrier(struct r10conf *conf) { unsigned long flags; - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + + write_seqlock_irqsave(&conf->resync_lock, flags); + WRITE_ONCE(conf->barrier, conf->barrier - 1); + write_sequnlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); }
+static bool wait_barrier_nolock(struct r10conf *conf) +{ + unsigned int seq = read_seqbegin(&conf->resync_lock); + + if (READ_ONCE(conf->barrier)) + return false; + + atomic_inc(&conf->nr_pending); + if (!read_seqretry(&conf->resync_lock, seq)) + return true; + + if (atomic_dec_and_test(&conf->nr_pending)) + wake_up_barrier(conf); + + return false; +} + static void wait_barrier(struct r10conf *conf) { - spin_lock_irq(&conf->resync_lock); + if (wait_barrier_nolock(conf)) + return; + + write_seqlock_irq(&conf->resync_lock); if (conf->barrier) { conf->nr_waiting++; /* Wait for the barrier to drop. @@ -1012,19 +1046,18 @@ static void wait_barrier(struct r10conf *conf) * count down. */ raid10_log(conf->mddev, "wait barrier"); - wait_event_lock_irq(conf->wait_barrier, + wait_event_barrier(conf, !conf->barrier || (atomic_read(&conf->nr_pending) && current->bio_list && (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1]))), - conf->resync_lock); + !bio_list_empty(¤t->bio_list[1])))); conf->nr_waiting--; if (!conf->nr_waiting) wake_up(&conf->wait_barrier); } atomic_inc(&conf->nr_pending); - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); }
static void allow_barrier(struct r10conf *conf) @@ -1048,27 +1081,26 @@ static void freeze_array(struct r10conf *conf, int extra) * must match the number of pending IOs (nr_pending) before * we continue. */ - spin_lock_irq(&conf->resync_lock); + write_seqlock_irq(&conf->resync_lock); conf->array_freeze_pending++; - conf->barrier++; + WRITE_ONCE(conf->barrier, conf->barrier + 1); conf->nr_waiting++; - wait_event_lock_irq_cmd(conf->wait_barrier, - atomic_read(&conf->nr_pending) == conf->nr_queued+extra, - conf->resync_lock, - flush_pending_writes(conf)); + wait_event_barrier_cmd(conf, + atomic_read(&conf->nr_pending) == conf->nr_queued+extra, + flush_pending_writes(conf));
conf->array_freeze_pending--; - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); }
static void unfreeze_array(struct r10conf *conf) { /* reverse the effect of the freeze */ - spin_lock_irq(&conf->resync_lock); - conf->barrier--; + write_seqlock_irq(&conf->resync_lock); + WRITE_ONCE(conf->barrier, conf->barrier - 1); conf->nr_waiting--; wake_up(&conf->wait_barrier); - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); }
static sector_t choose_data_offset(struct r10bio *r10_bio, @@ -3740,7 +3772,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->bio_end_io_list);
- spin_lock_init(&conf->resync_lock); + seqlock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); atomic_set(&conf->nr_pending, 0);
@@ -4080,7 +4112,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) rdev->new_raid_disk = rdev->raid_disk * 2; rdev->sectors = size; } - conf->barrier = 1; + WRITE_ONCE(conf->barrier, 1); }
return conf; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index d3eaaf3eb1bc..e368a92f37fd 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -77,7 +77,7 @@ struct r10conf { struct bio_list pending_bio_list; int pending_count;
- spinlock_t resync_lock; + seqlock_t resync_lock; atomic_t nr_pending; int nr_waiting; int nr_queued;
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
Add a new flag QUEUE_FLAG_DISPATCH_ASYNC and two new request_queue fields, 'dispatch_async_cpus' and 'last_dispatch_cpu', to prepare for dispatching bios asynchronously on specified CPUs. This patch also adds the corresponding sysfs APIs.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/blk-core.c | 28 +++++++++++++++++++++++++++-
 block/blk-sysfs.c | 39 +++++++++++++++++++++++++++++++++++++++
 block/blk.h | 2 ++
 include/linux/blkdev.h | 6 ++++++
 4 files changed, 74 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c index e98827d25ef8..bffecc437fbc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -85,6 +85,27 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue;
+static int blk_alloc_queue_dispatch_async(struct request_queue *q) +{ + int cpu; + + q->last_dispatch_cpu = alloc_percpu(int); + if (!q->last_dispatch_cpu) + return -ENOMEM; + + cpumask_setall(&q->dispatch_async_cpus); + for_each_possible_cpu(cpu) { + *per_cpu_ptr(q->last_dispatch_cpu, cpu) = cpu; + } + + return 0; +} + +void blk_free_queue_dispatch_async(struct request_queue *q) +{ + free_percpu(q->last_dispatch_cpu); +} + /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set @@ -1049,9 +1070,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, q->end_sector = 0; q->boundary_rq = NULL;
+ if (blk_alloc_queue_dispatch_async(q)) + goto fail_q; + q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_q; + goto fail_dispatch_async;
ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (ret) @@ -1130,6 +1154,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_dispatch_async: + blk_free_queue_dispatch_async(q); fail_q: kmem_cache_free(blk_requestq_cachep, q_wrapper); return NULL; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 30898a7855d7..60daf9b53a97 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -696,6 +696,42 @@ static struct queue_sysfs_entry queue_wb_lat_entry = { .store = queue_wb_lat_store, };
+static ssize_t queue_dispatch_async_cpus_show(struct request_queue *q, + char *page) +{ + int cpu; + ssize_t ret = 0; + + if (!test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags)) + return -EOPNOTSUPP; + + for_each_cpu(cpu, &q->dispatch_async_cpus) { + ret += sprintf(page + ret, "%d ", cpu); + } + + ret += sprintf(page + ret, "\n"); + return ret; +} + +static struct queue_sysfs_entry queue_dispatch_async_cpus_entry = { + .attr = {.name = "dispatch_async_cpus", .mode = 0444 }, + .show = queue_dispatch_async_cpus_show, +}; + +static ssize_t queue_show_dispatch_async(struct request_queue *q, + char *page) +{ + if (test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags)) + return sprintf(page, "1\n"); + else + return sprintf(page, "0\n"); +} + +static struct queue_sysfs_entry queue_dispatch_async_entry = { + .attr = {.name = "dispatch_async", .mode = 0444 }, + .show = queue_show_dispatch_async, +}; + #ifdef CONFIG_BLK_DEV_THROTTLING_LOW static struct queue_sysfs_entry throtl_sample_time_entry = { .attr = {.name = "throttle_sample_time", .mode = 0644 }, @@ -738,6 +774,8 @@ static struct attribute *default_attrs[] = { &queue_dax_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, + &queue_dispatch_async_cpus_entry.attr, + &queue_dispatch_async_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &throtl_sample_time_entry.attr, #endif @@ -819,6 +857,7 @@ static void __blk_release_queue(struct work_struct *work) if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); + blk_free_queue_dispatch_async(q);
if (!blk_queue_dead(q)) { /* diff --git a/block/blk.h b/block/blk.h index dde2141a32dd..f3094e18f89e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -460,4 +460,6 @@ extern int blk_iolatency_init(struct request_queue *q); static inline int blk_iolatency_init(struct request_queue *q) { return 0; } #endif
+extern void blk_free_queue_dispatch_async(struct request_queue *q); + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1deaf36eb237..fd1fc4670f31 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -701,6 +701,10 @@ struct request_queue {
struct work_struct release_work;
+ /* used when QUEUE_FLAG_DISPATCH_ASYNC is set */ + struct cpumask dispatch_async_cpus; + int __percpu *last_dispatch_cpu; + #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; }; @@ -739,6 +743,8 @@ struct request_queue { #define QUEUE_FLAG_FORECE_QUIESCE 29 /* force quiesce when cleanup queue */ /* queue has bee quiesced, used in block layer */ #define QUEUE_FLAG_QUIESCED_INTERNAL 30 +/* bio will be dispatched asynchronous */ +#define QUEUE_FLAG_DISPATCH_ASYNC 31
#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
On some architectures, memory access latency across nodes is much worse than within the local node. As a consequence, IO performance is rather bad when users issue IO from multiple nodes and lock contention exists in the driver.

This patch dispatches IO asynchronously to dedicated kthreads that are bound to CPUs belonging to the same node, so that cross-node memory access in the driver can be avoided.
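The submit-side decision mirrors the hunk below: a bio is handed off only when async dispatch is enabled for the queue, the submitting CPU is outside the queue's dispatch mask, and the bio can tolerate waiting; otherwise ->make_request_fn() runs synchronously as before. Handed-off bios are queued on a per-CPU list and a kthread bound to a CPU inside the mask is woken to drain them under a plug.

static blk_qc_t blk_queue_do_make_request(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;
	int cpu = smp_processor_id();

	/* stay synchronous unless a cross-CPU hand-off is both allowed
	 * and useful for this bio */
	if (!test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags) ||
	    cpumask_test_cpu(cpu, &q->dispatch_async_cpus) ||
	    bio->bi_opf & REQ_NOWAIT)
		return q->make_request_fn(q, bio);

	/* queue the bio for a kthread bound to a CPU in dispatch_async_cpus */
	blk_queue_make_request_async(bio);
	return BLK_QC_T_NONE;
}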
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/blk-core.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 187 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c index bffecc437fbc..bed4fbfbd337 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -85,6 +85,31 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue;
+#define BIO_DISPATCH_MAX_LOOP 16 + +/* prevent false sharing */ +#define BIO_ASYNC_LIST_SHIFT 2 +#define BIO_ASYNC_LOCK_SHIFT 4 +#define bio_async_list(ctl, i) (&(ctl)->list[(i) << BIO_ASYNC_LIST_SHIFT]) +#define bio_async_lock(ctl, i) (&(ctl)->lock[(i) << BIO_ASYNC_LOCK_SHIFT]) + +struct bio_dispatch_async_ctl { + /* + * Vector size is nr_cpu_ids, list stores bio dispatched from other cpu, + * such bio will be dispatched asynchronously to the cpu this structure + * is serviced. + */ + struct bio_list *list; + /* list is protected by lock */ + spinlock_t *lock; + /* kthread to dispatch bio asynchronously */ + struct task_struct *thread; + /* thread will wait here if there are no bios in list */ + wait_queue_head_t wait; +}; + +static struct bio_dispatch_async_ctl __percpu **bio_dispatch_async_ctl; + static int blk_alloc_queue_dispatch_async(struct request_queue *q) { int cpu; @@ -106,6 +131,165 @@ void blk_free_queue_dispatch_async(struct request_queue *q) free_percpu(q->last_dispatch_cpu); }
+static int collect_bio(struct bio_dispatch_async_ctl *ctl, + struct bio_list *list) +{ + int count = 0; + int cpu; + struct bio *bio; + + for_each_possible_cpu(cpu) { + spin_lock_irq(bio_async_lock(ctl, cpu)); + while ((bio = bio_list_pop(bio_async_list(ctl, cpu)))) { + bio_list_add(list, bio); + count++; + } + spin_unlock_irq(bio_async_lock(ctl, cpu)); + } + + return count; +} + +static int bio_dispatch_work(void *data) +{ + int loop_count = 0; + int cpu = smp_processor_id(); + struct bio_dispatch_async_ctl *ctl = + *per_cpu_ptr(bio_dispatch_async_ctl, cpu); + + for (;; loop_count++) { + struct bio_list bio_list_on_stack; + struct blk_plug plug; + struct bio *bio; + int count; + + bio_list_init(&bio_list_on_stack); + count = collect_bio(ctl, &bio_list_on_stack); + + if (!count) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&ctl->wait, &wait, + TASK_INTERRUPTIBLE); + count = collect_bio(ctl, &bio_list_on_stack); + if (count) + break; + schedule(); + loop_count = 0; + } + finish_wait(&ctl->wait, &wait); + + } + + blk_start_plug(&plug); + while ((bio = bio_list_pop(&bio_list_on_stack))) { + struct request_queue *q = bio->bi_disk->queue; + + q->make_request_fn(q, bio); + } + blk_finish_plug(&plug); + + /* prevent soft lockup */ + if (loop_count >= BIO_DISPATCH_MAX_LOOP) { + loop_count = 0; + cond_resched(); + } + } + + return 0; +} + +static int get_dispatch_cpu(struct request_queue *q, int cpu) +{ + int *last_dispatch_cpu = per_cpu_ptr(q->last_dispatch_cpu, cpu); + + cpu = cpumask_next(*last_dispatch_cpu, &q->dispatch_async_cpus); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(&q->dispatch_async_cpus); + + *last_dispatch_cpu = cpu; + + return cpu; +} + +static void blk_queue_make_request_async(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + int cpu = smp_processor_id(); + int dispatch_cpu = get_dispatch_cpu(q, cpu); + struct bio_dispatch_async_ctl *ctl = + *per_cpu_ptr(bio_dispatch_async_ctl, dispatch_cpu); + + spin_lock_irq(bio_async_lock(ctl, cpu)); + bio_list_add(bio_async_list(ctl, cpu), bio); + spin_unlock_irq(bio_async_lock(ctl, cpu)); + + if (wq_has_sleeper(&ctl->wait)) + wake_up(&ctl->wait); +} + +static blk_qc_t blk_queue_do_make_request(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + int cpu = smp_processor_id(); + + /* + * Don't dispatch bio asynchronously in following cases: + * + * 1) QUEUE_FLAG_DISPATCH_ASYNC is not set; + * 2) current cpu is the target cpu; + * 3) bio is flagged no wait; + * 4) TODO: return value of submit_bio() will be used in io polling. 
+ */ + if (!test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags) || + cpumask_test_cpu(cpu, &q->dispatch_async_cpus) || + bio->bi_opf & REQ_NOWAIT) + return q->make_request_fn(q, bio); + + /* return value is not concerned */ + blk_queue_make_request_async(bio); + return BLK_QC_T_NONE; +} + +static void init_blk_queue_async_dispatch(void) +{ + int cpu; + + bio_dispatch_async_ctl = alloc_percpu(struct bio_dispatch_async_ctl *); + if (!bio_dispatch_async_ctl) + panic("Failed to alloc bio_dispatch_async_ctl\n"); + + for_each_possible_cpu(cpu) { + int i; + struct bio_dispatch_async_ctl *ctl = + kmalloc(sizeof(struct bio_dispatch_async_ctl), + GFP_KERNEL | __GFP_NOFAIL); + + *per_cpu_ptr(bio_dispatch_async_ctl, cpu) = ctl; + + ctl->thread = + kthread_create_on_cpu(bio_dispatch_work, NULL, cpu, + "bio_dispatch_work_%u"); + if (IS_ERR_OR_NULL(ctl->thread)) + panic("Failed to create bio dispatch thread\n"); + + ctl->list = kmalloc_array(nr_cpu_ids, + sizeof(struct bio_list) << BIO_ASYNC_LIST_SHIFT, + GFP_KERNEL | __GFP_NOFAIL); + ctl->lock = kmalloc_array(nr_cpu_ids, + sizeof(spinlock_t) << BIO_ASYNC_LOCK_SHIFT, + GFP_KERNEL | __GFP_NOFAIL); + for (i = 0; i < nr_cpu_ids; ++i) { + bio_list_init(bio_async_list(ctl, i)); + spin_lock_init(bio_async_lock(ctl, i)); + } + + wake_up_process(ctl->thread); + init_waitqueue_head(&ctl->wait); + } +} + /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set @@ -2514,7 +2698,7 @@ blk_qc_t generic_make_request(struct bio *bio) /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = q->make_request_fn(q, bio); + ret = blk_queue_do_make_request(bio);
/* sort new bios into those for a lower level * and those for the same level @@ -4054,6 +4238,8 @@ int __init blk_dev_init(void) sizeof(struct request_queue_wrapper), 0, SLAB_PANIC, NULL);
+ init_blk_queue_async_dispatch(); + #ifdef CONFIG_DEBUG_FS blk_debugfs_root = debugfs_create_dir("block", NULL); #endif
From: Wang ShaoBo <bobo.shaobowang@huawei.com>

hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
On some architectures, masking the underlying processor topology differences can leave software unable to identify CPU distance, which results in performance fluctuations.

So provide an additional interface for getting the platform's preferred sibling cpumask. This sibling cpumask indicates CPUs that are clustered together with relatively short distances, but it depends heavily on the specific implementation of each platform.
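For illustration, a consumer of the new interface could look like the sketch below (example_pick_sibling_cpus() and the sibling index 0 are hypothetical, not part of this patch). The stub added to arch_topology.h clears the mask when the option is disabled, so callers should be ready to fall back:

#include <linux/arch_topology.h>
#include <linux/cpumask.h>

static void example_pick_sibling_cpus(struct cpumask *mask)
{
	/* ask the platform for the CPUs clustered under sibling index 0 */
	arch_get_preferred_sibling_cpumask(0, mask);

	/* nothing reported (or the option is off): fall back to all CPUs */
	if (cpumask_empty(mask))
		cpumask_setall(mask);
}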
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 arch/arm64/Kconfig | 15 +++++++++
 arch/arm64/include/asm/smp_plat.h | 14 +++++++++
 arch/arm64/kernel/smp.c | 9 ++++++
 arch/arm64/kernel/topology.c | 52 +++++++++++++++++++++++++++++++
 include/linux/arch_topology.h | 12 +++++++
 5 files changed, 102 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 80ab9c9dd43c..0ad6ce436355 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1389,6 +1389,21 @@ config RANDOMIZE_MODULE_REGION_FULL a limited range that contains the [_stext, _etext] interval of the core kernel, so branch relocations are always in range.
+config ARCH_GET_PREFERRED_SIBLING_CPUMASK + bool "Get preferred sibling cpumask from mpidr" + depends on ARM64 + default n + help + For some architectures, masking the underlying processor topology + differences can make software unable to identify the cpu distance, + which results in performance fluctuations. + + So we provide additional interface for getting preferred sibling's + cpumask supported by platform, this siblings' cpumask indicates those + CPUs which are clustered with relatively short distances, NOTE this + hardly depends on the specific implementation of the specific platform. + + menuconfig ASCEND_FEATURES bool "Support Ascend Features" depends on ARM64 diff --git a/arch/arm64/include/asm/smp_plat.h b/arch/arm64/include/asm/smp_plat.h index af58dcdefb21..63e29335f426 100644 --- a/arch/arm64/include/asm/smp_plat.h +++ b/arch/arm64/include/asm/smp_plat.h @@ -56,4 +56,18 @@ static inline int get_logical_index(u64 mpidr) return -EINVAL; }
+#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK +void update_mpidr_siblings_masks(unsigned int cpu, bool remove); + +static inline void mpidr_siblings_add_cpu(unsigned int cpu) +{ + update_mpidr_siblings_masks(cpu, false); +} + +static inline void mpidr_siblings_remove_cpu(unsigned int cpu) +{ + update_mpidr_siblings_masks(cpu, true); +} +#endif + #endif /* __ASM_SMP_PLAT_H */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index cdb81a36be85..6b8bc313a87b 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -426,6 +426,9 @@ asmlinkage notrace void secondary_start_kernel(void)
store_cpu_topology(cpu); numa_add_cpu(cpu); +#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK + mpidr_siblings_add_cpu(cpu); +#endif
/* * OK, now it's safe to let the boot CPU continue. Wait for @@ -481,6 +484,9 @@ int __cpu_disable(void)
remove_cpu_topology(cpu); numa_remove_cpu(cpu); +#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK + mpidr_siblings_remove_cpu(cpu); +#endif
/* * Take this CPU offline. Once we clear this, we can't return, @@ -945,6 +951,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus) store_cpu_topology(this_cpu); numa_store_cpu_info(this_cpu); numa_add_cpu(this_cpu); +#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK + mpidr_siblings_add_cpu(this_cpu); +#endif
/* * If UP is mandated by "nosmp" (which implies "maxcpus=0"), don't set diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 2646695e2f2a..bf937d334b81 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -348,6 +348,58 @@ void remove_cpu_topology(unsigned int cpu) clear_cpu_topology(cpu); }
+#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK +#define MAX_MPIDR_SIBLINGS 100 +static struct cpumask mpidr_siblings_cpumask_map[MAX_MPIDR_SIBLINGS]; + +static void +__update_mpidr_siblings_masks(unsigned int cpu, int sibling, bool remove) +{ + if (WARN_ON_ONCE(sibling < 0 || sibling >= MAX_MPIDR_SIBLINGS)) + return; + + if (remove) + cpumask_clear_cpu(cpu, &mpidr_siblings_cpumask_map[sibling]); + else + cpumask_set_cpu(cpu, &mpidr_siblings_cpumask_map[sibling]); +} + +void update_mpidr_siblings_masks(unsigned int cpu, bool remove) +{ + int sibling, affinity; + u32 midr_impl = MIDR_IMPLEMENTOR(read_cpuid_id()); + u64 mpidr = read_cpuid_mpidr(); + bool mt = mpidr & MPIDR_MT_BITMASK; + + switch (midr_impl) { + case ARM_CPU_IMP_HISI: + if (mt && read_cpuid_part_number() == HISI_CPU_PART_TSV110) { + affinity = MPIDR_AFFINITY_LEVEL(mpidr, 2); + sibling = ((affinity >> 3) - 1) / 2; + __update_mpidr_siblings_masks(cpu, sibling, remove); + } + break; + default: + break; + } +} + +void arch_get_preferred_sibling_cpumask(unsigned int sibling, + cpumask_var_t dstp) +{ + if (!dstp) + return; + + if (sibling >= MAX_MPIDR_SIBLINGS) { + cpumask_clear(dstp); + return; + } + + cpumask_copy(dstp, &mpidr_siblings_cpumask_map[sibling]); +} +EXPORT_SYMBOL(arch_get_preferred_sibling_cpumask); +#endif + #ifdef CONFIG_ACPI static bool __init acpi_cpu_is_threaded(int cpu) { diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 2b709416de05..80c28bfce557 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -32,4 +32,16 @@ unsigned long topology_get_freq_scale(int cpu) return per_cpu(freq_scale, cpu); }
+#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK +void arch_get_preferred_sibling_cpumask(unsigned int sibling, + cpumask_var_t dstp); +#else +static inline void +arch_get_preferred_sibling_cpumask(unsigned int sibling, cpumask_var_t dstp) +{ + if (dstp) + cpumask_clear(dstp); +} +#endif + #endif /* _LINUX_ARCH_TOPOLOGY_H_ */
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: performance
bugzilla: 187597, https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
Try to improve RAID performance when users issue IO concurrently from multiple nodes.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 drivers/md/md.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)
diff --git a/drivers/md/md.c b/drivers/md/md.c index 9c94190769b8..ac2bdb4665b8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -66,6 +66,7 @@ #include <linux/raid/md_u.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> +#include <linux/arch_topology.h>
#include <trace/events/block.h> #include "md.h" @@ -5543,6 +5544,16 @@ static void md_safemode_timeout(struct timer_list *t)
static int start_dirty_degraded;
+#define MIN_DISPATCH_ASYNC_CPUS 16 +static void queue_init_dispatch_async_cpus(struct request_queue *q, int node) +{ + arch_get_preferred_sibling_cpumask(node, &q->dispatch_async_cpus); + if (cpumask_weight(&q->dispatch_async_cpus) >= MIN_DISPATCH_ASYNC_CPUS) + blk_queue_flag_set(QUEUE_FLAG_DISPATCH_ASYNC, q); + else + cpumask_setall(&q->dispatch_async_cpus); +} + int md_run(struct mddev *mddev) { int err; @@ -5786,6 +5797,8 @@ int md_run(struct mddev *mddev) if (mddev->sb_flags) md_update_sb(mddev, 0);
+ if (mddev->queue && pers->level == 10) + queue_init_dispatch_async_cpus(mddev->queue, 1); md_new_event(mddev); return 0;
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: performance
bugzilla: 187597, https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
The new request_queue fields break KABI, so move them into request_queue_wrapper; since the wrapper is not accessible from drivers, introduce a new helper to initialize async dispatch.
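The shape of the fix is the usual wrapper trick, sketched below with illustrative names only (the real definitions live in block/blk.h and are just partially visible in this diff): the exported struct request_queue keeps its old layout, the new state lives in the block-internal wrapper, and block-layer code reaches it through a container_of()-style helper, while drivers only ever call the exported init function.

#include <linux/blkdev.h>

/* Illustrative sketch, not the real definitions. */
struct request_queue_wrapper_sketch {
	struct request_queue q;			/* KABI-frozen, unchanged */
	/* ... other block-internal fields ... */
	struct cpumask dispatch_async_cpus;
	int __percpu *last_dispatch_cpu;
};

#define sketch_queue_to_wrapper(queue) \
	container_of(queue, struct request_queue_wrapper_sketch, q)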
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/blk-core.c | 38 +++++++++++++++++++++++++++++---------
 block/blk-sysfs.c | 2 +-
 block/blk.h | 4 ++++
 drivers/md/md.c | 11 -----------
 include/linux/blkdev.h | 5 +----
 5 files changed, 35 insertions(+), 25 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index bed4fbfbd337..08e7a875b229 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -35,6 +35,7 @@ #include <linux/blk-cgroup.h> #include <linux/debugfs.h> #include <linux/bpf.h> +#include <linux/arch_topology.h>
#define CREATE_TRACE_POINTS #include <trace/events/block.h> @@ -86,6 +87,8 @@ struct kmem_cache *blk_requestq_cachep; static struct workqueue_struct *kblockd_workqueue;
#define BIO_DISPATCH_MAX_LOOP 16 +/* the minimum of cpus that dispatch async can be enabled */ +#define MIN_DISPATCH_ASYNC_CPUS 16
/* prevent false sharing */ #define BIO_ASYNC_LIST_SHIFT 2 @@ -112,15 +115,16 @@ static struct bio_dispatch_async_ctl __percpu **bio_dispatch_async_ctl;
static int blk_alloc_queue_dispatch_async(struct request_queue *q) { + struct request_queue_wrapper *q_wrapper = queue_to_wrapper(q); int cpu;
- q->last_dispatch_cpu = alloc_percpu(int); - if (!q->last_dispatch_cpu) + q_wrapper->last_dispatch_cpu = alloc_percpu(int); + if (!q_wrapper->last_dispatch_cpu) return -ENOMEM;
- cpumask_setall(&q->dispatch_async_cpus); + cpumask_setall(&q_wrapper->dispatch_async_cpus); for_each_possible_cpu(cpu) { - *per_cpu_ptr(q->last_dispatch_cpu, cpu) = cpu; + *per_cpu_ptr(q_wrapper->last_dispatch_cpu, cpu) = cpu; }
return 0; @@ -128,7 +132,7 @@ static int blk_alloc_queue_dispatch_async(struct request_queue *q)
void blk_free_queue_dispatch_async(struct request_queue *q) { - free_percpu(q->last_dispatch_cpu); + free_percpu(queue_to_wrapper(q)->last_dispatch_cpu); }
static int collect_bio(struct bio_dispatch_async_ctl *ctl, @@ -202,11 +206,14 @@ static int bio_dispatch_work(void *data)
static int get_dispatch_cpu(struct request_queue *q, int cpu) { - int *last_dispatch_cpu = per_cpu_ptr(q->last_dispatch_cpu, cpu); + int *last_dispatch_cpu = + per_cpu_ptr(queue_to_wrapper(q)->last_dispatch_cpu, cpu); + struct cpumask *dispatch_async_cpus = + &queue_to_wrapper(q)->dispatch_async_cpus;
- cpu = cpumask_next(*last_dispatch_cpu, &q->dispatch_async_cpus); + cpu = cpumask_next(*last_dispatch_cpu, dispatch_async_cpus); if (cpu >= nr_cpu_ids) - cpu = cpumask_first(&q->dispatch_async_cpus); + cpu = cpumask_first(dispatch_async_cpus);
*last_dispatch_cpu = cpu;
@@ -243,7 +250,7 @@ static blk_qc_t blk_queue_do_make_request(struct bio *bio) * 4) TODO: return value of submit_bio() will be used in io polling. */ if (!test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags) || - cpumask_test_cpu(cpu, &q->dispatch_async_cpus) || + cpumask_test_cpu(cpu, &queue_to_wrapper(q)->dispatch_async_cpus) || bio->bi_opf & REQ_NOWAIT) return q->make_request_fn(q, bio);
@@ -290,6 +297,19 @@ static void init_blk_queue_async_dispatch(void) } }
+void queue_init_dispatch_async_cpus(struct request_queue *q, int node) +{ + struct cpumask *dispatch_async_cpus = + &queue_to_wrapper(q)->dispatch_async_cpus; + + arch_get_preferred_sibling_cpumask(node, dispatch_async_cpus); + if (cpumask_weight(dispatch_async_cpus) >= MIN_DISPATCH_ASYNC_CPUS) + blk_queue_flag_set(QUEUE_FLAG_DISPATCH_ASYNC, q); + else + cpumask_setall(dispatch_async_cpus); +} +EXPORT_SYMBOL_GPL(queue_init_dispatch_async_cpus); + /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 60daf9b53a97..cbd7de22c463 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -705,7 +705,7 @@ static ssize_t queue_dispatch_async_cpus_show(struct request_queue *q, if (!test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags)) return -EOPNOTSUPP;
- for_each_cpu(cpu, &q->dispatch_async_cpus) { + for_each_cpu(cpu, &queue_to_wrapper(q)->dispatch_async_cpus) { ret += sprintf(page + ret, "%d ", cpu); }
diff --git a/block/blk.h b/block/blk.h index f3094e18f89e..cb1c9057d788 100644 --- a/block/blk.h +++ b/block/blk.h @@ -48,6 +48,10 @@ struct request_queue_wrapper { */ struct mutex mq_freeze_lock; int mq_freeze_depth; + + /* used when QUEUE_FLAG_DISPATCH_ASYNC is set */ + struct cpumask dispatch_async_cpus; + int __percpu *last_dispatch_cpu; };
#define queue_to_wrapper(q) \ diff --git a/drivers/md/md.c b/drivers/md/md.c index ac2bdb4665b8..2f99f47d5e61 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -66,7 +66,6 @@ #include <linux/raid/md_u.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> -#include <linux/arch_topology.h>
#include <trace/events/block.h> #include "md.h" @@ -5544,16 +5543,6 @@ static void md_safemode_timeout(struct timer_list *t)
static int start_dirty_degraded;
-#define MIN_DISPATCH_ASYNC_CPUS 16 -static void queue_init_dispatch_async_cpus(struct request_queue *q, int node) -{ - arch_get_preferred_sibling_cpumask(node, &q->dispatch_async_cpus); - if (cpumask_weight(&q->dispatch_async_cpus) >= MIN_DISPATCH_ASYNC_CPUS) - blk_queue_flag_set(QUEUE_FLAG_DISPATCH_ASYNC, q); - else - cpumask_setall(&q->dispatch_async_cpus); -} - int md_run(struct mddev *mddev) { int err; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fd1fc4670f31..996e00575042 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -701,10 +701,6 @@ struct request_queue {
struct work_struct release_work;
- /* used when QUEUE_FLAG_DISPATCH_ASYNC is set */ - struct cpumask dispatch_async_cpus; - int __percpu *last_dispatch_cpu; - #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; }; @@ -789,6 +785,7 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); +extern void queue_init_dispatch_async_cpus(struct request_queue *q, int node);
static inline int queue_in_flight(struct request_queue *q) {
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: performance
bugzilla: 187597, https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
If CONFIG_BLK_BIO_DISPATCH_ASYNC is enabled and the driver sets QUEUE_FLAG_DISPATCH_ASYNC, bios will be dispatched asynchronously to specific CPUs to avoid cross-node memory access in the driver.
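From a driver's point of view, opting in stays a one-liner, as sketched below (example_enable_async_dispatch() is hypothetical; the real caller in this series is md_run() for RAID10). The #else stubs added here make the call a no-op when the option is disabled, so drivers need no ifdefs:

#include <linux/blkdev.h>

static void example_enable_async_dispatch(struct request_queue *q, int node)
{
	/* sets QUEUE_FLAG_DISPATCH_ASYNC only when the preferred sibling
	 * cpumask for 'node' is large enough; otherwise dispatch stays
	 * synchronous and the mask falls back to all CPUs */
	queue_init_dispatch_async_cpus(q, node);

	if (test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags))
		pr_info("async bio dispatch enabled for node %d\n", node);
}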
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/Kconfig | 13 +++++++++++++
 block/blk-core.c | 21 ++++++++++++++++++++-
 block/blk-sysfs.c | 4 ++++
 block/blk.h | 8 ++++++++
 include/linux/blkdev.h | 7 +++++++
 5 files changed, 52 insertions(+), 1 deletion(-)
diff --git a/block/Kconfig b/block/Kconfig index 1f2469a0123c..da71e56f8682 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -200,6 +200,19 @@ config BLK_SED_OPAL Enabling this option enables users to setup/unlock/lock Locking ranges for SED devices using the Opal protocol.
+config BLK_BIO_DISPATCH_ASYNC + bool "Dispatch bios asynchronously on specific cpus" + default n + depends on BLOCK=y + help + If there are multiple nodes, memory access across nodes is rather bad + compare to local node. And if some drivers are using internal spin + locks, io performance will be bad if bios are issued concurrently from + different nodes. This feature will dispatch bio asynchronously to the + specific CPUs to avoid across nodes memory access in driver, noted this + feature will require special care in the driver to work. If unsure, + say N here. + menu "Partition Types"
source "block/partitions/Kconfig" diff --git a/block/blk-core.c b/block/blk-core.c index 08e7a875b229..2ce4ad4619c1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -35,7 +35,6 @@ #include <linux/blk-cgroup.h> #include <linux/debugfs.h> #include <linux/bpf.h> -#include <linux/arch_topology.h>
#define CREATE_TRACE_POINTS #include <trace/events/block.h> @@ -86,6 +85,9 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue;
+#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC +#include <linux/arch_topology.h> + #define BIO_DISPATCH_MAX_LOOP 16 /* the minimum of cpus that dispatch async can be enabled */ #define MIN_DISPATCH_ASYNC_CPUS 16 @@ -309,6 +311,23 @@ void queue_init_dispatch_async_cpus(struct request_queue *q, int node) cpumask_setall(dispatch_async_cpus); } EXPORT_SYMBOL_GPL(queue_init_dispatch_async_cpus); +#else +static int blk_alloc_queue_dispatch_async(struct request_queue *q) +{ + return 0; +} + +static blk_qc_t blk_queue_do_make_request(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + + return q->make_request_fn(q, bio); +} + +static void init_blk_queue_async_dispatch(void) +{ +} +#endif
/** * blk_queue_flag_set - atomically set a queue flag diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cbd7de22c463..5bdae7c8fa27 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -696,6 +696,7 @@ static struct queue_sysfs_entry queue_wb_lat_entry = { .store = queue_wb_lat_store, };
+#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC static ssize_t queue_dispatch_async_cpus_show(struct request_queue *q, char *page) { @@ -731,6 +732,7 @@ static struct queue_sysfs_entry queue_dispatch_async_entry = { .attr = {.name = "dispatch_async", .mode = 0444 }, .show = queue_show_dispatch_async, }; +#endif
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static struct queue_sysfs_entry throtl_sample_time_entry = { @@ -774,8 +776,10 @@ static struct attribute *default_attrs[] = { &queue_dax_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, +#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC &queue_dispatch_async_cpus_entry.attr, &queue_dispatch_async_entry.attr, +#endif #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &throtl_sample_time_entry.attr, #endif diff --git a/block/blk.h b/block/blk.h index cb1c9057d788..985e20980aff 100644 --- a/block/blk.h +++ b/block/blk.h @@ -49,9 +49,11 @@ struct request_queue_wrapper { struct mutex mq_freeze_lock; int mq_freeze_depth;
+#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC /* used when QUEUE_FLAG_DISPATCH_ASYNC is set */ struct cpumask dispatch_async_cpus; int __percpu *last_dispatch_cpu; +#endif };
#define queue_to_wrapper(q) \ @@ -464,6 +466,12 @@ extern int blk_iolatency_init(struct request_queue *q); static inline int blk_iolatency_init(struct request_queue *q) { return 0; } #endif
+#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC extern void blk_free_queue_dispatch_async(struct request_queue *q); +#else +static inline void blk_free_queue_dispatch_async(struct request_queue *q) +{ +} +#endif
#endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 996e00575042..41d235ee579a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -785,7 +785,14 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); +#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC extern void queue_init_dispatch_async_cpus(struct request_queue *q, int node); +#else +static inline void queue_init_dispatch_async_cpus(struct request_queue *q, + int node) +{ +} +#endif
static inline int queue_in_flight(struct request_queue *q) {
From: Sasha Levin <sashal@kernel.org>

stable inclusion
from stable-v5.4.215
commit 4051324a6dafd7053c74c475e80b3ba10ae672b0
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5T9C3
CVE: CVE-2022-3303
---------------------------
[ Upstream commit 8423f0b6d513b259fdab9c9bf4aaa6188d054c2d ]
There is a small race window at snd_pcm_oss_sync() that is called from OSS PCM SNDCTL_DSP_SYNC ioctl; namely the function calls snd_pcm_oss_make_ready() at first, then takes the params_lock mutex for the rest. When the stream is set up again by another thread between them, it leads to inconsistency, and may result in unexpected results such as NULL dereference of OSS buffer as a fuzzer spotted recently.
The fix is simply to move the snd_pcm_oss_make_ready() call under the same params_lock mutex, using the snd_pcm_oss_make_ready_locked() variant.
Reported-and-tested-by: butt3rflyh4ck <butterflyhuangxx@gmail.com>
Reviewed-by: Jaroslav Kysela <perex@perex.cz>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/CAFcO6XN7JDM4xSXGhtusQfS2mSBcx50VJKwQpCq=WeLt57aaZ...
Link: https://lore.kernel.org/r/20220905060714.22549-1-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Xia Longlong <xialonglong1@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 sound/core/oss/pcm_oss.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 41abb8bd466a..29c5f572ca3b 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1653,13 +1653,14 @@ static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) goto __direct; - if ((err = snd_pcm_oss_make_ready(substream)) < 0) - return err; atomic_inc(&runtime->oss.rw_ref); if (mutex_lock_interruptible(&runtime->oss.params_lock)) { atomic_dec(&runtime->oss.rw_ref); return -ERESTARTSYS; } + err = snd_pcm_oss_make_ready_locked(substream); + if (err < 0) + goto unlock; format = snd_pcm_oss_format_from(runtime->oss.format); width = snd_pcm_format_physical_width(format); if (runtime->oss.buffer_used > 0) {
From: Luo Meng <luomeng12@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5TY3L
CVE: NA
--------------------------------
A crash occurs as follows:

BUG: unable to handle page fault for address: 000000011241cec7
sd 5:0:0:1: [sdl] Synchronizing SCSI cache
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] SMP PTI
CPU: 3 PID: 2465367 Comm: multipath Kdump: loaded Tainted: G W O 5.10.0-60.18.0.50.h478.eulerosv2r11.x86_64 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-20220525_182517-szxrtosci10000 04/01/2014
RIP: 0010:kernfs_new_node+0x22/0x60
Code: cc cc 66 0f 1f 44 00 00 0f 1f 44 00 00 41 54 41 89 cb 0f b7 ca 48 89 f2 53 48 8b 47 08 48 89 fb 48 89 de 48 85 c0 48 0f 44 c7 <48> 8b 78 50 41 51 45 89 c1 45 89 d8 e8 4d ee ff ff 5a 49 89 c4 48
RSP: 0018:ffffa178419539e8 EFLAGS: 00010206
RAX: 000000011241ce77 RBX: ffff9596828395a0 RCX: 000000000000a1ff
RDX: ffff9595ada828b0 RSI: ffff9596828395a0 RDI: ffff9596828395a0
RBP: ffff95959a9a2a80 R08: 0000000000000000 R09: 0000000000000004
R10: ffff9595ca0bf930 R11: 0000000000000000 R12: ffff9595ada828b0
R13: ffff9596828395a0 R14: 0000000000000001 R15: ffff9595948c5c80
FS: 00007f64baa10200(0000) GS:ffff9596bad80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000011241cec7 CR3: 000000011923e003 CR4: 0000000000170ee0
Call Trace:
 kernfs_create_link+0x31/0xa0
 sysfs_do_create_link_sd+0x61/0xc0
 bd_link_disk_holder+0x10a/0x180
 dm_get_table_device+0x10b/0x1f0 [dm_mod]
 __dm_get_device+0x1e2/0x280 [dm_mod]
 ? kmem_cache_alloc_trace+0x2fb/0x410
 parse_path+0xca/0x200 [dm_multipath]
 parse_priority_group+0x19d/0x1f0 [dm_multipath]
 multipath_ctr+0x27a/0x491 [dm_multipath]
 dm_table_add_target+0x177/0x360 [dm_mod]
 table_load+0x12b/0x380 [dm_mod]
 ctl_ioctl+0x199/0x290 [dm_mod]
 ? dev_suspend+0xd0/0xd0 [dm_mod]
 dm_ctl_ioctl+0xa/0x20 [dm_mod]
 __se_sys_ioctl+0x85/0xc0
 do_syscall_64+0x33/0x40
 entry_SYSCALL_64_after_hwframe+0x61/0xc6
This can be easily reproduced:
1) Add a delay before "ret = add_symlink(bdev->bd_part->holder_dir...)" in bd_link_disk_holder();
2) dmsetup create xxx --table "0 1000 linear /dev/sda 0"
3) echo 1 > /sys/block/sda/device/delete

Deleting /dev/sda releases holder_dir, but add_symlink() still uses holder_dir, so a use-after-free occurs in this case.

Fix this problem by taking a reference count on holder_dir.
Signed-off-by: Luo Meng <luomeng12@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 fs/block_dev.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/fs/block_dev.c b/fs/block_dev.c index 9868b21b8ef9..7e891e08d0ce 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1650,6 +1650,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } bdev->bd_openers++; + kobject_get(bdev->bd_part->holder_dir); if (for_part) bdev->bd_part_count++; if (mode & FMODE_WRITE) @@ -1925,6 +1926,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (for_part) bdev->bd_part_count--;
+ kobject_put(bdev->bd_part->holder_dir); if (!--bdev->bd_openers) { WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev);