From: Jens Axboe <axboe@kernel.dk>
mainline inclusion
from mainline-5.4-rc1
commit 54a91f3bb9b96ed86bc12b2f7e06b3fce8e86503
category: feature
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=27
CVE: NA

---------------------------
All the popular filesystems need to grab the inode lock for buffered writes. With io_uring punting buffered writes to async context, we observe a lot of contention with all workers hammering this mutex.
For buffered writes, we generally don't need a lot of parallelism on the submission side, as the flushing will take care of that for us. Hence we don't need a deep queue on the write side, as long as we can safely punt from the original submission context.
Add a workqueue with a limit of 2 that we can use for buffered writes. This greatly improves the performance and efficiency of higher queue depth buffered async writes with io_uring.
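For reference, the kind of workload that hits this path could look like the sketch below: a hypothetical liburing test program (not part of this patch; queue depth, block size, and the output file are arbitrary) that queues a batch of buffered, non-O_DIRECT writes at queue depth, all of which get punted to async context:

/*
 * Hypothetical liburing test program (not part of this patch): queues a
 * batch of buffered (non-O_DIRECT) writes at queue depth. Queue depth,
 * block size and the output file are arbitrary.
 * Build with: gcc -o bufwrite bufwrite.c -luring
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <liburing.h>

#define QD	64
#define BS	4096

int main(int argc, char *argv[])
{
	struct io_uring ring;
	struct iovec iovs[QD];
	int fd, i, ret;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	/* Buffered writes: no O_DIRECT, so the kiocb lacks IOCB_DIRECT. */
	fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	ret = io_uring_queue_init(QD, &ring, 0);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	for (i = 0; i < QD; i++) {
		struct io_uring_sqe *sqe;

		iovs[i].iov_len = BS;
		iovs[i].iov_base = malloc(BS);
		memset(iovs[i].iov_base, 0xaa, BS);

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_writev(sqe, fd, &iovs[i], 1, (off_t)i * BS);
	}

	/* Submit all QD buffered writes in one go. */
	ret = io_uring_submit(&ring);
	fprintf(stderr, "submitted %d writes\n", ret);

	/* Reap the completions. */
	for (i = 0; i < QD; i++) {
		struct io_uring_cqe *cqe;

		if (io_uring_wait_cqe(&ring, &cqe) < 0)
			break;
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	close(fd);
	return 0;
}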
Reported-by: Andres Freund <andres@anarazel.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: yangerkun <yangerkun@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 fs/io_uring.c | 47 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 8 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index b23a4f3b6c61..2bd3c4cc1394 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -203,7 +203,7 @@ struct io_ring_ctx {
 	} ____cacheline_aligned_in_smp;
 
 	/* IO offload */
-	struct workqueue_struct	*sqo_wq;
+	struct workqueue_struct	*sqo_wq[2];
 	struct task_struct	*sqo_thread;	/* if using sq thread polling */
 	struct mm_struct	*sqo_mm;
 	wait_queue_head_t	sqo_wait;
@@ -445,7 +445,19 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 static inline void io_queue_async_work(struct io_ring_ctx *ctx,
 				       struct io_kiocb *req)
 {
-	queue_work(ctx->sqo_wq, &req->work);
+	int rw;
+
+	switch (req->submit.sqe->opcode) {
+	case IORING_OP_WRITEV:
+	case IORING_OP_WRITE_FIXED:
+		rw = !(req->rw.ki_flags & IOCB_DIRECT);
+		break;
+	default:
+		rw = 0;
+		break;
+	}
+
+	queue_work(ctx->sqo_wq[rw], &req->work);
 }
 
 static void io_commit_cqring(struct io_ring_ctx *ctx)
@@ -2633,11 +2645,15 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 
 static void io_finish_async(struct io_ring_ctx *ctx)
 {
+	int i;
+
 	io_sq_thread_stop(ctx);
 
-	if (ctx->sqo_wq) {
-		destroy_workqueue(ctx->sqo_wq);
-		ctx->sqo_wq = NULL;
+	for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
+		if (ctx->sqo_wq[i]) {
+			destroy_workqueue(ctx->sqo_wq[i]);
+			ctx->sqo_wq[i] = NULL;
+		}
 	}
 }
 
@@ -2845,16 +2861,31 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 	}
 
 	/* Do QD, or 2 * CPUS, whatever is smallest */
-	ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
+	ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
+			WQ_UNBOUND | WQ_FREEZABLE,
 			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
-	if (!ctx->sqo_wq) {
+	if (!ctx->sqo_wq[0]) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * This is for buffered writes, where we want to limit the parallelism
+	 * due to file locking in file systems. As "normal" buffered writes
+	 * should parellelize on writeout quite nicely, limit us to having 2
+	 * pending. This avoids massive contention on the inode when doing
+	 * buffered async writes.
+	 */
+	ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
+			WQ_UNBOUND | WQ_FREEZABLE, 2);
+	if (!ctx->sqo_wq[1]) {
 		ret = -ENOMEM;
 		goto err;
 	}
 
 	return 0;
 err:
-	io_sq_thread_stop(ctx);
+	io_finish_async(ctx);
 	mmdrop(ctx->sqo_mm);
 	ctx->sqo_mm = NULL;
 	return ret;
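As a side note on the design choice: the third argument to alloc_workqueue() is max_active, which caps how many queued work items the workqueue will run concurrently (per NUMA node for unbound workqueues), so "io_ring-write-wq" executes at most two buffered writes at a time while further punted writes simply wait their turn. A hypothetical, self-contained module sketch (not part of this patch; all names are made up) illustrating that behaviour:

/*
 * Hypothetical module sketch (not part of this patch): with max_active = 2,
 * the unbound workqueue runs at most two of the queued work items at once
 * and the remaining ones wait.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <linux/atomic.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_work[8];
static atomic_t demo_running = ATOMIC_INIT(0);

static void demo_fn(struct work_struct *work)
{
	int now = atomic_inc_return(&demo_running);

	/* With max_active = 2, 'now' should not exceed 2 on a single node. */
	pr_info("demo_wq: %d work item(s) running\n", now);
	msleep(100);
	atomic_dec(&demo_running);
}

static int __init demo_init(void)
{
	int i;

	demo_wq = alloc_workqueue("demo-write-wq", WQ_UNBOUND | WQ_FREEZABLE, 2);
	if (!demo_wq)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(demo_work); i++) {
		INIT_WORK(&demo_work[i], demo_fn);
		queue_work(demo_wq, &demo_work[i]);
	}
	return 0;
}

static void __exit demo_exit(void)
{
	/* Waits for pending work before tearing the workqueue down. */
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");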