From: Yu Kuai yukuai3@huawei.com
Changes in v2:
- Fix the problem that, if 'nr_pending' is decreased to 0 in wait_barrier_nolock() in patch 1, waiters on 'conf->wait_barrier' are not woken, so raise_barrier() can hang while waiting for 'nr_pending' to reach 0.
- Only modify the hot path in patch 2.
- Use node 1 as the default in patch 6.
On some architectures, for example KUNPENG 920, memory access latency across nodes is much worse than on the local node. As a consequence, io performance is rather bad when users issue io from multiple nodes and lock contention exists in the driver.
This patchset tries to avoid cross-node memory access in the driver.
Test environment: aarch64 Huawei KUNPENG 920
Raid10 initialize: mdadm --create /dev/md0 --level 10 --bitmap none --raid-devices 4 /dev/nvme0n1 /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1
Test cmd: (taskset -c 0-15) fio -name=0 -ioengine=libaio -direct=1 -group_reporting=1 -randseed=2022 -rwmixread=70 -refill_buffers -filename=/dev/md0 -numjobs=16 -runtime=60s -bs=4k -iodepth=256 -rw=randread
Test result:
before this patchset:             3.2 GiB/s
before this patchset (bind node): 6.9 GiB/s
after this patchset:              7.9 GiB/s
after this patchset (bind node):  8.0 GiB/s
Wang ShaoBo (1):
  arm64/topology: Getting preferred sibling's cpumask supported by platform

Yu Kuai (5):
  md/raid10: convert resync_lock to use seqlock
  md/raid10: prevent unnecessary calls to wake_up() in fast path
  block: add new fields in request_queue
  block: support to dispatch bio asynchronously
  md: enable dispatching bio asynchronously by default

 arch/arm64/Kconfig                |   8 ++
 arch/arm64/include/asm/smp_plat.h |  14 ++
 arch/arm64/kernel/smp.c           |   9 ++
 arch/arm64/kernel/topology.c      |  51 +++++++
 block/blk-core.c                  | 212 +++++++++++++++++++++++++++++-
 block/blk-sysfs.c                 |  40 ++++++
 drivers/md/md.c                   |   5 +
 drivers/md/raid10.c               |  98 +++++++++-----
 drivers/md/raid10.h               |   2 +-
 include/linux/arch_topology.h     |   7 +
 include/linux/blkdev.h            |   6 +
 11 files changed, 420 insertions(+), 32 deletions(-)
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier', and io can't be dispatched until 'barrier' is dropped.
Since holding the 'barrier' is not common, convert 'resync_lock' to a seqlock so that taking the lock can be avoided in the fast path.
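For readers unfamiliar with the pattern, the lockless fast path added below (wait_barrier_nolock()) boils down to the following. This is a commented restatement for illustration only; the diff is authoritative:

static bool wait_barrier_nolock(struct r10conf *conf)
{
        unsigned int seq = read_seqbegin(&conf->resync_lock);

        /* plain read; the seqcount tells us whether it was stable */
        if (READ_ONCE(conf->barrier))
                return false;

        atomic_inc(&conf->nr_pending);
        if (!read_seqretry(&conf->resync_lock, seq))
                /* no raise_barrier()/lower_barrier() raced with us */
                return true;

        /* a writer raced: undo the increment and fall back to the
         * locked slow path in wait_barrier() */
        if (atomic_dec_and_test(&conf->nr_pending))
                wake_up_barrier(conf);

        return false;
}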
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 drivers/md/raid10.c | 88 ++++++++++++++++++++++++++++++---------------
 drivers/md/raid10.h |  2 +-
 2 files changed, 61 insertions(+), 29 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5049c4d829e5..2d4192872714 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -112,6 +112,21 @@ static void end_reshape(struct r10conf *conf);
#include "raid1-10.c"
+#define NULL_CMD +#define cmd_before(conf, cmd) \ + do { \ + write_sequnlock_irq(&(conf)->resync_lock); \ + cmd; \ + } while (0) +#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock) + +#define wait_event_barrier_cmd(conf, cond, cmd) \ + wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \ + cmd_after(conf)) + +#define wait_event_barrier(conf, cond) \ + wait_event_barrier_cmd(conf, cond, NULL_CMD) + /* * for resync bio, r10bio pointer can be retrieved from the per-bio * 'struct resync_pages'. @@ -965,35 +980,54 @@ static void flush_pending_writes(struct r10conf *conf) static void raise_barrier(struct r10conf *conf, int force) { BUG_ON(force && !conf->barrier); - spin_lock_irq(&conf->resync_lock); + write_seqlock_irq(&conf->resync_lock);
/* Wait until no block IO is waiting (unless 'force') */ - wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, - conf->resync_lock); + wait_event_barrier(conf, force || !conf->nr_waiting);
/* block any new IO from starting */ - conf->barrier++; + WRITE_ONCE(conf->barrier, conf->barrier + 1);
/* Now wait for all pending IO to complete */ - wait_event_lock_irq(conf->wait_barrier, - !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH, - conf->resync_lock); + wait_event_barrier(conf, !atomic_read(&conf->nr_pending) && + conf->barrier < RESYNC_DEPTH);
- spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); }
static void lower_barrier(struct r10conf *conf) { unsigned long flags; - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + + write_seqlock_irqsave(&conf->resync_lock, flags); + WRITE_ONCE(conf->barrier, conf->barrier - 1); + write_sequnlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); }
+static bool wait_barrier_nolock(struct r10conf *conf) +{ + unsigned int seq = read_seqbegin(&conf->resync_lock); + + if (READ_ONCE(conf->barrier)) + return false; + + atomic_inc(&conf->nr_pending); + if (!read_seqretry(&conf->resync_lock, seq)) + return true; + + if (atomic_dec_and_test(&conf->nr_pending)) + wake_up_barrier(conf); + + return false; +} + static void wait_barrier(struct r10conf *conf) { - spin_lock_irq(&conf->resync_lock); + if (wait_barrier_nolock(conf)) + return; + + write_seqlock_irq(&conf->resync_lock); if (conf->barrier) { conf->nr_waiting++; /* Wait for the barrier to drop. @@ -1006,19 +1040,18 @@ static void wait_barrier(struct r10conf *conf) * count down. */ raid10_log(conf->mddev, "wait barrier"); - wait_event_lock_irq(conf->wait_barrier, + wait_event_barrier(conf, !conf->barrier || (atomic_read(&conf->nr_pending) && current->bio_list && (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1]))), - conf->resync_lock); + !bio_list_empty(¤t->bio_list[1])))); conf->nr_waiting--; if (!conf->nr_waiting) wake_up(&conf->wait_barrier); } atomic_inc(&conf->nr_pending); - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); }
static void allow_barrier(struct r10conf *conf) @@ -1042,27 +1075,26 @@ static void freeze_array(struct r10conf *conf, int extra) * must match the number of pending IOs (nr_pending) before * we continue. */ - spin_lock_irq(&conf->resync_lock); + write_seqlock_irq(&conf->resync_lock); conf->array_freeze_pending++; - conf->barrier++; + WRITE_ONCE(conf->barrier, conf->barrier + 1); conf->nr_waiting++; - wait_event_lock_irq_cmd(conf->wait_barrier, - atomic_read(&conf->nr_pending) == conf->nr_queued+extra, - conf->resync_lock, - flush_pending_writes(conf)); + wait_event_barrier_cmd(conf, + atomic_read(&conf->nr_pending) == conf->nr_queued+extra, + flush_pending_writes(conf));
conf->array_freeze_pending--; - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); }
static void unfreeze_array(struct r10conf *conf) { /* reverse the effect of the freeze */ - spin_lock_irq(&conf->resync_lock); - conf->barrier--; + write_seqlock_irq(&conf->resync_lock); + WRITE_ONCE(conf->barrier, conf->barrier - 1); conf->nr_waiting--; wake_up(&conf->wait_barrier); - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); }
static sector_t choose_data_offset(struct r10bio *r10_bio, @@ -3734,7 +3766,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->bio_end_io_list);
- spin_lock_init(&conf->resync_lock); + seqlock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); atomic_set(&conf->nr_pending, 0);
@@ -4074,7 +4106,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) rdev->new_raid_disk = rdev->raid_disk * 2; rdev->sectors = size; } - conf->barrier = 1; + WRITE_ONCE(conf->barrier, 1); }
return conf; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index d3eaaf3eb1bc..e368a92f37fd 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -77,7 +77,7 @@ struct r10conf { struct bio_list pending_bio_list; int pending_count;
- spinlock_t resync_lock; + seqlock_t resync_lock; atomic_t nr_pending; int nr_waiting; int nr_queued;
Reviewed-by: Jason Yan yanaijie@huawei.com
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
Currently, wake_up() is called unconditionally in the fast path, for example in raid10_make_request(), which causes lock contention under high concurrency:

raid10_make_request
 wake_up
  __wake_up_common_lock
   spin_lock_irqsave

Improve performance by calling wake_up() only if the waitqueue is not empty.
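The guard is tiny but worth spelling out: wq_has_sleeper() issues a full memory barrier before testing the waitqueue, pairing with the barrier in prepare_to_wait()/set_current_state(), so as long as the waiter re-checks its condition after prepare_to_wait(), skipping wake_up() does not lose wakeups; it only skips the waitqueue spinlock when nobody is sleeping. A commented restatement of the helper added below:

static void wake_up_barrier(struct r10conf *conf)
{
        /* the barrier in wq_has_sleeper() pairs with prepare_to_wait() */
        if (wq_has_sleeper(&conf->wait_barrier))
                wake_up(&conf->wait_barrier);
}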
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 drivers/md/raid10.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2d4192872714..f8fea9593955 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -320,6 +320,12 @@ static void put_buf(struct r10bio *r10_bio) lower_barrier(conf); }
+static void wake_up_barrier(struct r10conf *conf) +{ + if (wq_has_sleeper(&conf->wait_barrier)) + wake_up(&conf->wait_barrier); +} + static void reschedule_retry(struct r10bio *r10_bio) { unsigned long flags; @@ -1058,7 +1064,7 @@ static void allow_barrier(struct r10conf *conf) { if ((atomic_dec_and_test(&conf->nr_pending)) || (conf->array_freeze_pending)) - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); }
static void freeze_array(struct r10conf *conf, int extra) @@ -1616,7 +1622,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) __make_request(mddev, bio, sectors);
/* In case raid10d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); return true; }
Reviewed-by: Jason Yan yanaijie@huawei.com
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
Add a new flag QUEUE_FLAG_DISPATCH_ASYNC and two new request_queue fields, 'dispatch_async_cpus' and 'last_dispatch_cpu', to prepare for dispatching bios asynchronously on specified cpus. This patch also adds the corresponding sysfs APIs.
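The flag and fields are plumbing only; a driver opts in roughly as sketched below (illustration with a hypothetical helper name; the md patch later in this series does the equivalent):

#include <linux/blkdev.h>

static void enable_async_dispatch(struct request_queue *q,
                                  const struct cpumask *preferred)
{
        /* bios submitted from cpus outside this mask are handed off to
         * per-cpu kthreads running on cpus inside it */
        cpumask_copy(&q->dispatch_async_cpus, preferred);
        if (cpumask_first(&q->dispatch_async_cpus) < nr_cpu_ids)
                blk_queue_flag_set(QUEUE_FLAG_DISPATCH_ASYNC, q);
}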
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 block/blk-core.c       | 19 +++++++++++++++++++
 block/blk-sysfs.c      | 40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |  6 ++++++
 3 files changed, 65 insertions(+)
diff --git a/block/blk-core.c b/block/blk-core.c index e98827d25ef8..fc81dff50a34 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1018,6 +1018,22 @@ static void blk_timeout_work_empty(struct work_struct *work) { }
+static int blk_alloc_queue_dispatch_async(struct request_queue *q) +{ + int cpu; + + q->last_dispatch_cpu = alloc_percpu(int); + if (!q->last_dispatch_cpu) + return -ENOMEM; + + cpumask_setall(&q->dispatch_async_cpus); + for_each_possible_cpu(cpu) { + *per_cpu_ptr(q->last_dispatch_cpu, cpu) = cpu; + } + + return 0; +} + /** * blk_alloc_queue_node - allocate a request queue * @gfp_mask: memory allocation flags @@ -1049,6 +1065,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, q->end_sector = 0; q->boundary_rq = NULL;
+ if (blk_alloc_queue_dispatch_async(q)) + goto fail_q; + q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) goto fail_q; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 30898a7855d7..55b22d66672c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -696,6 +696,42 @@ static struct queue_sysfs_entry queue_wb_lat_entry = { .store = queue_wb_lat_store, };
+static ssize_t queue_dispatch_async_cpus_show(struct request_queue *q, + char *page) +{ + int cpu; + ssize_t ret = 0; + + if (!test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags)) + return -EOPNOTSUPP; + + for_each_cpu(cpu, &q->dispatch_async_cpus) { + ret += sprintf(page + ret, "%d ", cpu); + } + + ret += sprintf(page + ret, "\n"); + return ret; +} + +static struct queue_sysfs_entry queue_dispatch_async_cpus_entry = { + .attr = {.name = "dispatch_async_cpus", .mode = 0444 }, + .show = queue_dispatch_async_cpus_show, +}; + +static ssize_t queue_show_dispatch_async(struct request_queue *q, + char *page) +{ + if (test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags)) + return sprintf(page, "1\n"); + else + return sprintf(page, "0\n"); +} + +static struct queue_sysfs_entry queue_dispatch_async_entry = { + .attr = {.name = "dispatch_async", .mode = 0444 }, + .show = queue_show_dispatch_async, +}; + #ifdef CONFIG_BLK_DEV_THROTTLING_LOW static struct queue_sysfs_entry throtl_sample_time_entry = { .attr = {.name = "throttle_sample_time", .mode = 0644 }, @@ -738,6 +774,8 @@ static struct attribute *default_attrs[] = { &queue_dax_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, + &queue_dispatch_async_cpus_entry.attr, + &queue_dispatch_async_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &throtl_sample_time_entry.attr, #endif @@ -820,6 +858,8 @@ static void __blk_release_queue(struct work_struct *work) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb);
+ free_percpu(q->last_dispatch_cpu); + if (!blk_queue_dead(q)) { /* * Last reference was dropped without having called diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1deaf36eb237..fd1fc4670f31 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -701,6 +701,10 @@ struct request_queue {
struct work_struct release_work;
+ /* used when QUEUE_FLAG_DISPATCH_ASYNC is set */ + struct cpumask dispatch_async_cpus; + int __percpu *last_dispatch_cpu; + #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; }; @@ -739,6 +743,8 @@ struct request_queue { #define QUEUE_FLAG_FORECE_QUIESCE 29 /* force quiesce when cleanup queue */ /* queue has bee quiesced, used in block layer */ #define QUEUE_FLAG_QUIESCED_INTERNAL 30 +/* bio will be dispatched asynchronous */ +#define QUEUE_FLAG_DISPATCH_ASYNC 31
#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \
On 2022/9/16 16:33, Yu Kuai wrote:
> @@ -1049,6 +1065,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
>  	q->end_sector = 0;
>  	q->boundary_rq = NULL;
>
> +	if (blk_alloc_queue_dispatch_async(q))
> +		goto fail_q;
> +
>  	q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
>  	if (q->id < 0)
>  		goto fail_q;
A memory allocation is added above, so when this step fails the percpu memory allocated above needs to be freed. The same applies to the other error paths.
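For illustration, the unwind being asked for could look roughly like the following (hypothetical label name, not the actual blk_alloc_queue_node() error path):

	if (blk_alloc_queue_dispatch_async(q))
		goto fail_q;

	q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
	if (q->id < 0)
		goto fail_dispatch_async;

	/* ... later failures also unwind through fail_dispatch_async ... */

fail_dispatch_async:
	free_percpu(q->last_dispatch_cpu);
fail_q:
	/* existing queue teardown */
	return NULL;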
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 1deaf36eb237..fd1fc4670f31 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -701,6 +701,10 @@ struct request_queue {
>
>  	struct work_struct release_work;
>
> +	/* used when QUEUE_FLAG_DISPATCH_ASYNC is set */
> +	struct cpumask dispatch_async_cpus;
> +	int __percpu *last_dispatch_cpu;
> +
Doesn't kABI need to be taken care of here?
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
On some architectures memory access latency across nodes is much worse than on the local node. As a consequence, io performance is rather bad when users issue io from multiple nodes and lock contention exists in the driver.
This patch dispatches io asynchronously to dedicated kthreads that are bound to cpus belonging to the same node, so that cross-node memory access in the driver can be avoided.
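The core of the hand-off is blk_queue_make_request_async() from the hunk below, restated here with comments for readability (illustration only; the diff is authoritative):

static void blk_queue_make_request_async(struct bio *bio)
{
        struct request_queue *q = bio->bi_disk->queue;
        int cpu = get_cpu();
        /* round-robin over q->dispatch_async_cpus, tracked per submitting cpu */
        int dispatch_cpu = get_dispatch_cpu(q, cpu);
        struct bio_dispatch_async_ctl *ctl =
                *per_cpu_ptr(bio_dispatch_async_ctl, dispatch_cpu);

        /* each submitting cpu has its own sub-list, to avoid one hot lock */
        spin_lock_irq(bio_async_lock(ctl, cpu));
        bio_list_add(bio_async_list(ctl, cpu), bio);
        spin_unlock_irq(bio_async_lock(ctl, cpu));

        /* kick the target cpu's kthread, which replays q->make_request_fn() */
        if (wq_has_sleeper(&ctl->wait))
                wake_up(&ctl->wait);

        put_cpu();
}

Per-submitter sub-lists on the producer side plus a per-target-cpu kthread on the consumer side keep both ends of the hand-off node-local, which is the point of the series.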
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 block/blk-core.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 192 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c index fc81dff50a34..69bea96fab90 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -85,6 +85,29 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue;
+/* prevent false sharing */ +#define BIO_ASYNC_LIST_SHIFT 2 +#define BIO_ASYNC_LOCK_SHIFT 4 +#define bio_async_list(ctl, i) (&ctl->list[i << BIO_ASYNC_LIST_SHIFT]) +#define bio_async_lock(ctl, i) (&ctl->lock[i << BIO_ASYNC_LOCK_SHIFT]) + +struct bio_dispatch_async_ctl { + /* + * Vector size is nr_cpu_ids, list stores bio dispatched from other cpu, + * such bio will be dispatched asynchronously to the cpu this structure + * is serviced. + */ + struct bio_list *list; + /* list is protected by lock */ + spinlock_t *lock; + /* kthread to dispatch bio asynchronously */ + struct task_struct *thread; + /* thread will wait here if there are no bios in list */ + wait_queue_head_t wait; +}; + +static struct bio_dispatch_async_ctl __percpu **bio_dispatch_async_ctl; + /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set @@ -2295,6 +2318,133 @@ static inline int blk_partition_remap(struct bio *bio) return ret; }
+static int collect_bio(struct bio_dispatch_async_ctl *ctl, + struct bio_list *list) +{ + int count = 0; + int cpu; + struct bio *bio; + + for_each_possible_cpu(cpu) { + spin_lock_irq(bio_async_lock(ctl, cpu)); + while ((bio = bio_list_pop(bio_async_list(ctl, cpu)))) { + bio_list_add(list, bio); + count++; + } + spin_unlock_irq(bio_async_lock(ctl, cpu)); + } + + return count; +} + +#define BIO_DISPATCH_MAX_LOOP 16 +static int bio_dispatch_work(void *data) +{ + int loop_count = 0; + int cpu = get_cpu(); + struct bio_dispatch_async_ctl *ctl = + *per_cpu_ptr(bio_dispatch_async_ctl, cpu); + + for (;; loop_count++) { + struct bio_list bio_list_on_stack; + struct blk_plug plug; + struct bio *bio; + int count; + + bio_list_init(&bio_list_on_stack); + count = collect_bio(ctl, &bio_list_on_stack); + + if (!count) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&ctl->wait, &wait, + TASK_INTERRUPTIBLE); + count = collect_bio(ctl, &bio_list_on_stack); + if (count) + break; + schedule(); + loop_count = 0; + } + finish_wait(&ctl->wait, &wait); + + } + + blk_start_plug(&plug); + while ((bio = bio_list_pop(&bio_list_on_stack))) { + struct request_queue *q = bio->bi_disk->queue; + + q->make_request_fn(q, bio); + } + blk_finish_plug(&plug); + + /* prevent soft lockup */ + if (loop_count >= BIO_DISPATCH_MAX_LOOP) { + loop_count = 0; + cond_resched(); + } + } + + put_cpu(); + return 0; +} + +static int get_dispatch_cpu(struct request_queue *q, int cpu) +{ + int *last_dispatch_cpu = per_cpu_ptr(q->last_dispatch_cpu, cpu); + + cpu = cpumask_next(*last_dispatch_cpu, &q->dispatch_async_cpus); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(&q->dispatch_async_cpus); + + *last_dispatch_cpu = cpu; + + return cpu; +} + +static void blk_queue_make_request_async(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + int cpu = get_cpu(); + int dispatch_cpu = get_dispatch_cpu(q, cpu); + struct bio_dispatch_async_ctl *ctl = + *per_cpu_ptr(bio_dispatch_async_ctl, dispatch_cpu); + + spin_lock_irq(bio_async_lock(ctl, cpu)); + bio_list_add(bio_async_list(ctl, cpu), bio); + spin_unlock_irq(bio_async_lock(ctl, cpu)); + + if (wq_has_sleeper(&ctl->wait)) + wake_up(&ctl->wait); + + put_cpu(); +} + +static blk_qc_t blk_queue_do_make_request(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + int cpu = get_cpu(); + + put_cpu(); + + /* + * Don't dispatch bio asynchronously in following cases: + * + * 1) QUEUE_FLAG_DISPATCH_ASYNC is not set; + * 2) current cpu is the target cpu; + * 3) bio is flagged no wait; + * 4) TODO: return value of submit_bio() will be used in io polling. + */ + if (!test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags) || + cpumask_test_cpu(cpu, &q->dispatch_async_cpus) || + bio->bi_opf & REQ_NOWAIT) + return q->make_request_fn(q, bio); + + /* return value is not concerned */ + blk_queue_make_request_async(bio); + return BLK_QC_T_NONE; +} + static noinline_for_stack bool generic_make_request_checks(struct bio *bio) { @@ -2507,7 +2657,7 @@ blk_qc_t generic_make_request(struct bio *bio) /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = q->make_request_fn(q, bio); + ret = blk_queue_do_make_request(bio);
/* sort new bios into those for a lower level * and those for the same level @@ -4026,6 +4176,45 @@ void blk_set_runtime_active(struct request_queue *q) EXPORT_SYMBOL(blk_set_runtime_active); #endif
+static void init_blk_queue_async_dispatch(void) +{ + int cpu; + + bio_dispatch_async_ctl = alloc_percpu(struct bio_dispatch_async_ctl *); + if (!bio_dispatch_async_ctl) + panic("Failed to alloc bio_dispatch_async_ctl\n"); + + for_each_possible_cpu(cpu) { + int i; + struct bio_dispatch_async_ctl *ctl = + kmalloc(sizeof(struct bio_dispatch_async_ctl), + GFP_KERNEL | __GFP_NOFAIL); + + *per_cpu_ptr(bio_dispatch_async_ctl, cpu) = ctl; + + ctl->thread = + kthread_create_on_cpu(bio_dispatch_work, NULL, cpu, + "bio_dispatch_work_%u"); + if (IS_ERR_OR_NULL(ctl->thread)) + panic("Failed to create bio dispatch thread\n"); + + ctl->list = kmalloc_array(nr_cpu_ids, + sizeof(struct bio_list) << BIO_ASYNC_LIST_SHIFT, + GFP_KERNEL | __GFP_NOFAIL); + ctl->lock = kmalloc_array(nr_cpu_ids, + sizeof(spinlock_t) << BIO_ASYNC_LOCK_SHIFT, + GFP_KERNEL | __GFP_NOFAIL); + for (i = 0; i < nr_cpu_ids; ++i) { + bio_list_init(bio_async_list(ctl, i)); + spin_lock_init(bio_async_lock(ctl, i)); + } + + kthread_set_per_cpu(ctl->thread, cpu); + wake_up_process(ctl->thread); + init_waitqueue_head(&ctl->wait); + } +} + int __init blk_dev_init(void) { BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); @@ -4047,6 +4236,8 @@ int __init blk_dev_init(void) sizeof(struct request_queue_wrapper), 0, SLAB_PANIC, NULL);
+ init_blk_queue_async_dispatch(); + #ifdef CONFIG_DEBUG_FS blk_debugfs_root = debugfs_create_dir("block", NULL); #endif
On 2022/9/16 16:33, Yu Kuai wrote:
> +#define BIO_DISPATCH_MAX_LOOP 16
> +static int bio_dispatch_work(void *data)
> +{
> +	int loop_count = 0;
> +	int cpu = get_cpu();
As far as I can see get_cpu() disables preemption, so this thread runs with preemption disabled the whole time; doesn't that cause scheduling problems?
> +static void blk_queue_make_request_async(struct bio *bio)
> +{
> +	struct request_queue *q = bio->bi_disk->queue;
> +	int cpu = get_cpu();
> +	int dispatch_cpu = get_dispatch_cpu(q, cpu);
> +	struct bio_dispatch_async_ctl *ctl =
> +		*per_cpu_ptr(bio_dispatch_async_ctl, dispatch_cpu);
> +
> +	spin_lock_irq(bio_async_lock(ctl, cpu));
> +	bio_list_add(bio_async_list(ctl, cpu), bio);
> +	spin_unlock_irq(bio_async_lock(ctl, cpu));
> +
> +	if (wq_has_sleeper(&ctl->wait))
> +		wake_up(&ctl->wait);
If the worker thread is not waiting here (or has just finished waiting), won't the wakeup be missed, with nobody left to wake it up?
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
Add support for getting the preferred sibling cpumask provided by the platform. Which cpus form a sibling group depends on the specific platform implementation.
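A minimal consumer could look like the sketch below (hypothetical helper name; patch 6 in this series does the equivalent for md). The mask is cleared first so that, when the Kconfig option is disabled and the stub does nothing, the helper reports no siblings:

#include <linux/cpumask.h>
#include <linux/arch_topology.h>

static bool get_sibling_group_cpus(unsigned int group, struct cpumask *mask)
{
        /* start empty: the !CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK stub
         * leaves the mask untouched */
        cpumask_clear(mask);
        arch_get_preferred_sibling_mask(group, mask);
        return !cpumask_empty(mask);
}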
Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 arch/arm64/Kconfig                |  8 +++++
 arch/arm64/include/asm/smp_plat.h | 14 +++++++++
 arch/arm64/kernel/smp.c           |  9 ++++++
 arch/arm64/kernel/topology.c      | 51 +++++++++++++++++++++++++++++++
 include/linux/arch_topology.h     |  7 +++++
 5 files changed, 89 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 80ab9c9dd43c..c68dd7d7f054 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1389,6 +1389,14 @@ config RANDOMIZE_MODULE_REGION_FULL a limited range that contains the [_stext, _etext] interval of the core kernel, so branch relocations are always in range.
+config ARCH_GET_PREFERRED_SIBLING_CPUMASK + bool "Get preferred sibling cpumask from mpidr" + depends on ARM64 + default y + help + Get preferred sibling cpumask given by mpidr, this hardly depends on + the specific implementation of the specific platform. + menuconfig ASCEND_FEATURES bool "Support Ascend Features" depends on ARM64 diff --git a/arch/arm64/include/asm/smp_plat.h b/arch/arm64/include/asm/smp_plat.h index af58dcdefb21..63e29335f426 100644 --- a/arch/arm64/include/asm/smp_plat.h +++ b/arch/arm64/include/asm/smp_plat.h @@ -56,4 +56,18 @@ static inline int get_logical_index(u64 mpidr) return -EINVAL; }
+#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK +void update_mpidr_siblings_masks(unsigned int cpu, bool remove); + +static inline void mpidr_siblings_add_cpu(unsigned int cpu) +{ + update_mpidr_siblings_masks(cpu, false); +} + +static inline void mpidr_siblings_remove_cpu(unsigned int cpu) +{ + update_mpidr_siblings_masks(cpu, true); +} +#endif + #endif /* __ASM_SMP_PLAT_H */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index cdb81a36be85..6b8bc313a87b 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -426,6 +426,9 @@ asmlinkage notrace void secondary_start_kernel(void)
store_cpu_topology(cpu); numa_add_cpu(cpu); +#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK + mpidr_siblings_add_cpu(cpu); +#endif
/* * OK, now it's safe to let the boot CPU continue. Wait for @@ -481,6 +484,9 @@ int __cpu_disable(void)
remove_cpu_topology(cpu); numa_remove_cpu(cpu); +#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK + mpidr_siblings_remove_cpu(cpu); +#endif
/* * Take this CPU offline. Once we clear this, we can't return, @@ -945,6 +951,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus) store_cpu_topology(this_cpu); numa_store_cpu_info(this_cpu); numa_add_cpu(this_cpu); +#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK + mpidr_siblings_add_cpu(this_cpu); +#endif
/* * If UP is mandated by "nosmp" (which implies "maxcpus=0"), don't set diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 2646695e2f2a..b8446aad72ea 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -348,6 +348,57 @@ void remove_cpu_topology(unsigned int cpu) clear_cpu_topology(cpu); }
+#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK +#define MAX_MPIDR_SIBLINGS 100 +static struct cpumask mpidr_siblings_cpumask_map[MAX_MPIDR_SIBLINGS]; + +static void +__update_mpidr_siblings_masks(unsigned int cpu, int sibling, bool remove) +{ + if (WARN_ON_ONCE(sibling < 0 || sibling >= MAX_MPIDR_SIBLINGS)) + return; + + if (remove) + cpumask_clear_cpu(cpu, &mpidr_siblings_cpumask_map[sibling]); + else + cpumask_set_cpu(cpu, &mpidr_siblings_cpumask_map[sibling]); +} + +void update_mpidr_siblings_masks(unsigned int cpu, bool remove) +{ + int sibling, affinity; + u32 midr_impl = MIDR_IMPLEMENTOR(read_cpuid_id()); + u64 mpidr = read_cpuid_mpidr(); + bool mt = mpidr & MPIDR_MT_BITMASK; + + switch (midr_impl) { + case ARM_CPU_IMP_HISI: + if (mt && read_cpuid_part_number() == HISI_CPU_PART_TSV110) { + affinity = MPIDR_AFFINITY_LEVEL(mpidr, 2); + sibling = ((affinity >> 3) - 1) / 2; + __update_mpidr_siblings_masks(cpu, sibling, remove); + } + break; + default: + break; + } +} + +void arch_get_preferred_sibling_mask(unsigned int sibling, cpumask_var_t dstp) +{ + if (dstp == NULL) + return; + + if (sibling >= MAX_MPIDR_SIBLINGS) { + cpumask_clear(dstp); + return; + } + + cpumask_copy(dstp, &mpidr_siblings_cpumask_map[sibling]); +} +EXPORT_SYMBOL(arch_get_preferred_sibling_mask); +#endif + #ifdef CONFIG_ACPI static bool __init acpi_cpu_is_threaded(int cpu) { diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 2b709416de05..80ce8172b8b5 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -32,4 +32,11 @@ unsigned long topology_get_freq_scale(int cpu) return per_cpu(freq_scale, cpu); }
+#ifdef CONFIG_ARCH_GET_PREFERRED_SIBLING_CPUMASK +void arch_get_preferred_sibling_mask(unsigned int sibling, cpumask_var_t dstp); +#else +static inline void +arch_get_preferred_sibling_mask(unsigned int sibling, cpumask_var_t dstp) {} +#endif + #endif /* _LINUX_ARCH_TOPOLOGY_H_ */
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: performance
bugzilla: 187597, https://gitee.com/openeuler/kernel/issues/I5QK5M
CVE: NA
--------------------------------
Try to improve raid performance when users issue io concurrently from multiple nodes: enable asynchronous bio dispatch for md by default, restricting dispatch to the cpus of preferred sibling group 1.
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 drivers/md/md.c | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/drivers/md/md.c b/drivers/md/md.c index 9c94190769b8..f5f90e6d51e4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -66,6 +66,7 @@ #include <linux/raid/md_u.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> +#include <linux/arch_topology.h>
#include <trace/events/block.h> #include "md.h" @@ -5442,6 +5443,10 @@ static int md_alloc(dev_t dev, char *name) goto abort; mddev->queue->queuedata = mddev;
+ arch_get_preferred_sibling_mask(1, &mddev->queue->dispatch_async_cpus); + if (cpumask_first(&mddev->queue->dispatch_async_cpus) < nr_cpu_ids) + blk_queue_flag_set(QUEUE_FLAG_DISPATCH_ASYNC, mddev->queue); + blk_queue_make_request(mddev->queue, md_make_request); blk_set_stacking_limits(&mddev->queue->limits);
Reviewed-by: Jason Yan yanaijie@huawei.com