From: Jens Axboe <axboe@kernel.dk>
mainline inclusion
from mainline-5.5-rc1
commit 206aefde4f886fdeb3b6339aacab3a85fb74cb7e
category: feature
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=27
CVE: NA

---------------------------
With the recent flurry of additions and changes to io_uring, the layout of io_ring_ctx has become a bit stale. We're right now at 704 bytes in size on my x86-64 build, or 11 cachelines. This patch does two things:
- We have two completion structs embedded that we only use for quiesce of the ctx (or shutdown) and for the sqthread init case. That's 2x32 bytes right there; let's dynamically allocate them.
- Reorder the struct a bit with an eye on cachelines, use cases, and holes.
With this patch, we're down to 512 bytes, or 8 cachelines.
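
As a back-of-the-envelope illustration of the first change, here is a standalone userspace sketch (not the kernel structs; the names and the 32-byte size are stand-ins for struct completion on a typical x86-64 build): embedding two rarely-used completions costs 64 bytes inline on the hot struct, while a single pointer to a dynamically allocated pair costs 8.

#include <stdio.h>
#include <stdlib.h>

struct completion { char opaque[32]; };	/* ~32 bytes on x86-64 */

struct ctx_embedded {			/* before: both completions inline */
	struct completion ctx_done;
	struct completion sqo_thread_started;
};

struct ctx_dynamic {			/* after: one pointer, pair off-struct */
	struct completion *completions;
};

int main(void)
{
	struct ctx_dynamic ctx;

	/* one allocation covers both rarely-used completions */
	ctx.completions = malloc(2 * sizeof(struct completion));
	if (!ctx.completions)
		return 1;

	printf("embedded: %zu bytes inline\n", sizeof(struct ctx_embedded)); /* 64 */
	printf("dynamic:  %zu bytes inline\n", sizeof(struct ctx_dynamic));  /* 8 */

	free(ctx.completions);
	return 0;
}

pahole(1) on the built object is the way to verify the real layout, holes, and cacheline boundaries.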
Reviewed-by: Jackie Liu <liuyun01@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	fs/io_uring.c
	[ Patch 214828962de ("io_uring: initialize percpu refcounters
	  using PERCU_REF_ALLOW_REINIT") is not applied. ]
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: yangerkun <yangerkun@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 fs/io_uring.c | 69 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 31 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6f5edbb83f86..914a999a458b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -204,6 +204,7 @@ struct io_ring_ctx {
 		unsigned		sq_mask;
 		unsigned		sq_thread_idle;
 		unsigned		cached_sq_dropped;
+		atomic_t		cached_cq_overflow;
 		struct io_uring_sqe	*sq_sqes;
 
 		struct list_head	defer_list;
@@ -213,25 +214,13 @@ struct io_ring_ctx {
 		wait_queue_head_t	inflight_wait;
 	} ____cacheline_aligned_in_smp;
 
+	struct io_rings	*rings;
+
 	/* IO offload */
 	struct io_wq		*io_wq;
 	struct task_struct	*sqo_thread;	/* if using sq thread polling */
 	struct mm_struct	*sqo_mm;
 	wait_queue_head_t	sqo_wait;
-	struct completion	sqo_thread_started;
-
-	struct {
-		unsigned		cached_cq_tail;
-		atomic_t		cached_cq_overflow;
-		unsigned		cq_entries;
-		unsigned		cq_mask;
-		struct wait_queue_head	cq_wait;
-		struct fasync_struct	*cq_fasync;
-		struct eventfd_ctx	*cq_ev_fd;
-		atomic_t		cq_timeouts;
-	} ____cacheline_aligned_in_smp;
-
-	struct io_rings	*rings;
 
 	/*
 	 * If used, fixed file set. Writers must ensure that ->refs is dead,
@@ -247,7 +236,22 @@ struct io_ring_ctx {
 
 	struct user_struct	*user;
 
-	struct completion	ctx_done;
+	/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
+	struct completion	*completions;
+
+#if defined(CONFIG_UNIX)
+	struct socket		*ring_sock;
+#endif
+
+	struct {
+		unsigned		cached_cq_tail;
+		unsigned		cq_entries;
+		unsigned		cq_mask;
+		atomic_t		cq_timeouts;
+		struct wait_queue_head	cq_wait;
+		struct fasync_struct	*cq_fasync;
+		struct eventfd_ctx	*cq_ev_fd;
+	} ____cacheline_aligned_in_smp;
 
 	struct {
 		struct mutex		uring_lock;
@@ -269,10 +273,6 @@ struct io_ring_ctx {
 		spinlock_t		inflight_lock;
 		struct list_head	inflight_list;
 	} ____cacheline_aligned_in_smp;
-
-#if defined(CONFIG_UNIX)
-	struct socket		*ring_sock;
-#endif
 };
 
 struct sqe_submit {
@@ -397,7 +397,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 {
 	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 
-	complete(&ctx->ctx_done);
+	complete(&ctx->completions[0]);
 }
 
 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
@@ -408,16 +408,18 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	if (!ctx)
 		return NULL;
 
-	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
-		kfree(ctx);
-		return NULL;
-	}
+	ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
+	if (!ctx->completions)
+		goto err;
+
+	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL))
+		goto err;
 
 	ctx->flags = p->flags;
 	init_waitqueue_head(&ctx->cq_wait);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
-	init_completion(&ctx->ctx_done);
-	init_completion(&ctx->sqo_thread_started);
+	init_completion(&ctx->completions[0]);
+	init_completion(&ctx->completions[1]);
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
@@ -429,6 +431,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	spin_lock_init(&ctx->inflight_lock);
 	INIT_LIST_HEAD(&ctx->inflight_list);
 	return ctx;
+err:
+	kfree(ctx->completions);
+	kfree(ctx);
+	return NULL;
 }
 
 static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
@@ -3046,7 +3052,7 @@ static int io_sq_thread(void *data)
 	unsigned inflight;
 	unsigned long timeout;
 
-	complete(&ctx->sqo_thread_started);
+	complete(&ctx->completions[1]);
 
 	old_fs = get_fs();
 	set_fs(USER_DS);
@@ -3286,7 +3292,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 {
 	if (ctx->sqo_thread) {
-		wait_for_completion(&ctx->sqo_thread_started);
+		wait_for_completion(&ctx->completions[1]);
 		/*
 		 * The park is a bit of a work-around, without it we get
 		 * warning spews on shutdown with SQPOLL set and affinity
@@ -4109,6 +4115,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_unaccount_mem(ctx->user,
 			ring_pages(ctx->sq_entries, ctx->cq_entries));
 	free_uid(ctx->user);
+	kfree(ctx->completions);
 	kfree(ctx);
 }
@@ -4153,7 +4160,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	io_iopoll_reap_events(ctx);
 	io_cqring_overflow_flush(ctx, true);
-	wait_for_completion(&ctx->ctx_done);
+	wait_for_completion(&ctx->completions[0]);
 	io_ring_ctx_free(ctx);
 }
@@ -4556,7 +4563,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	 * no new references will come in after we've killed the percpu ref.
 	 */
 	mutex_unlock(&ctx->uring_lock);
-	wait_for_completion(&ctx->ctx_done);
+	wait_for_completion(&ctx->completions[0]);
 	mutex_lock(&ctx->uring_lock);
 
 	switch (opcode) {
@@ -4599,7 +4606,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	}
 
 	/* bring the ctx back to life */
-	reinit_completion(&ctx->ctx_done);
+	reinit_completion(&ctx->completions[0]);
 	percpu_ref_reinit(&ctx->refs);
 	return ret;
 }
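
One note on the error path in io_ring_ctx_alloc() above: the ctx is zero-allocated, so when the completions allocation fails, ctx->completions is still NULL at the shared err label and kfree(NULL) is harmlessly a no-op.

For readers less familiar with the completion API, the following is a minimal userspace analogue (pthread primitives stand in for the kernel's; all names are illustrative, this is not kernel code) of how the two slots are used: slot 1 is completed by the SQ poll thread once it has started, and slot 0 is completed when the last ctx reference drops, gating teardown and quiesce.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct completion {
	pthread_mutex_t	lock;
	pthread_cond_t	cond;
	int		done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = 0;
}

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_broadcast(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static struct completion *completions;

static void *sq_thread(void *arg)
{
	(void)arg;
	complete(&completions[1]);	/* "I have started" */
	return NULL;
}

int main(void)
{
	pthread_t thr;

	/* one allocation for both, as in io_ring_ctx_alloc() */
	completions = malloc(2 * sizeof(struct completion));
	if (!completions)
		return 1;
	init_completion(&completions[0]);
	init_completion(&completions[1]);

	pthread_create(&thr, NULL, sq_thread, NULL);
	wait_for_completion(&completions[1]);	/* as in io_sq_thread_stop() */
	printf("sq thread started\n");

	complete(&completions[0]);		/* as in io_ring_ctx_ref_free() */
	wait_for_completion(&completions[0]);	/* as in io_ring_ctx_wait_and_kill() */

	pthread_join(thr, NULL);
	free(completions);
	return 0;
}

Build with cc -pthread. In the kernel, slot 0 is additionally reinit_completion()'ed in __io_uring_register() so the same completion can gate the next quiesce cycle.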