From: Jens Axboe axboe@kernel.dk
mainline inclusion from mainline-5.2-rc2 commit 004d564f908790efe815a6510a542ac1227ef2a2 category: feature bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=27 CVE: NA ---------------------------
Various fixes and changes have been applied to liburing since we copied some select bits to the kernel testing/examples part, sync up with liburing to get those changes.
Most notable is the change that split the CQE reading into the peek and seen event, instead of being just a single function. Also fixes an unsigned wrap issue in io_uring_submit(), leak of 'fd' in setup if we fail, and various other little issues.
Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: yangerkun yangerkun@huawei.com Reviewed-by: zhangyi (F) yi.zhang@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com --- tools/io_uring/io_uring-cp.c | 21 ++++++++---- tools/io_uring/liburing.h | 64 +++++++++++++++++++++++++++++------- tools/io_uring/queue.c | 36 ++++++++------------ tools/io_uring/setup.c | 10 ++++-- tools/io_uring/syscall.c | 48 +++++++++++++++++---------- 5 files changed, 118 insertions(+), 61 deletions(-)
diff --git a/tools/io_uring/io_uring-cp.c b/tools/io_uring/io_uring-cp.c index 633f65bb43a7..81461813ec62 100644 --- a/tools/io_uring/io_uring-cp.c +++ b/tools/io_uring/io_uring-cp.c @@ -13,6 +13,7 @@ #include <assert.h> #include <errno.h> #include <inttypes.h> +#include <sys/types.h> #include <sys/stat.h> #include <sys/ioctl.h>
@@ -85,11 +86,16 @@ static int queue_read(struct io_uring *ring, off_t size, off_t offset) struct io_uring_sqe *sqe; struct io_data *data;
+ data = malloc(size + sizeof(*data)); + if (!data) + return 1; + sqe = io_uring_get_sqe(ring); - if (!sqe) + if (!sqe) { + free(data); return 1; + }
- data = malloc(size + sizeof(*data)); data->read = 1; data->offset = data->first_offset = offset;
@@ -166,22 +172,23 @@ static int copy_file(struct io_uring *ring, off_t insize) struct io_data *data;
if (!got_comp) { - ret = io_uring_wait_completion(ring, &cqe); + ret = io_uring_wait_cqe(ring, &cqe); got_comp = 1; } else - ret = io_uring_get_completion(ring, &cqe); + ret = io_uring_peek_cqe(ring, &cqe); if (ret < 0) { - fprintf(stderr, "io_uring_get_completion: %s\n", + fprintf(stderr, "io_uring_peek_cqe: %s\n", strerror(-ret)); return 1; } if (!cqe) break;
- data = (struct io_data *) (uintptr_t) cqe->user_data; + data = io_uring_cqe_get_data(cqe); if (cqe->res < 0) { if (cqe->res == -EAGAIN) { queue_prepped(ring, data); + io_uring_cqe_seen(ring, cqe); continue; } fprintf(stderr, "cqe failed: %s\n", @@ -193,6 +200,7 @@ static int copy_file(struct io_uring *ring, off_t insize) data->iov.iov_len -= cqe->res; data->offset += cqe->res; queue_prepped(ring, data); + io_uring_cqe_seen(ring, cqe); continue; }
@@ -209,6 +217,7 @@ static int copy_file(struct io_uring *ring, off_t insize) free(data); writes--; } + io_uring_cqe_seen(ring, cqe); } }
diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h index cab0f50257ba..5f305c86b892 100644 --- a/tools/io_uring/liburing.h +++ b/tools/io_uring/liburing.h @@ -1,10 +1,16 @@ #ifndef LIB_URING_H #define LIB_URING_H
+#ifdef __cplusplus +extern "C" { +#endif + #include <sys/uio.h> #include <signal.h> #include <string.h> #include "../../include/uapi/linux/io_uring.h" +#include <inttypes.h> +#include "barrier.h"
/* * Library interface to io_uring @@ -46,7 +52,7 @@ struct io_uring { * System calls */ extern int io_uring_setup(unsigned entries, struct io_uring_params *p); -extern int io_uring_enter(unsigned fd, unsigned to_submit, +extern int io_uring_enter(int fd, unsigned to_submit, unsigned min_complete, unsigned flags, sigset_t *sig); extern int io_uring_register(int fd, unsigned int opcode, void *arg, unsigned int nr_args); @@ -59,13 +65,32 @@ extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, extern int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring); extern void io_uring_queue_exit(struct io_uring *ring); -extern int io_uring_get_completion(struct io_uring *ring, +extern int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr); -extern int io_uring_wait_completion(struct io_uring *ring, +extern int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr); extern int io_uring_submit(struct io_uring *ring); extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
+/* + * Must be called after io_uring_{peek,wait}_cqe() after the cqe has + * been processed by the application. + */ +static inline void io_uring_cqe_seen(struct io_uring *ring, + struct io_uring_cqe *cqe) +{ + if (cqe) { + struct io_uring_cq *cq = &ring->cq; + + (*cq->khead)++; + /* + * Ensure that the kernel sees our new head, the kernel has + * the matching read barrier. + */ + write_barrier(); + } +} + /* * Command prep helpers */ @@ -74,8 +99,14 @@ static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data) sqe->user_data = (unsigned long) data; }
+static inline void *io_uring_cqe_get_data(struct io_uring_cqe *cqe) +{ + return (void *) (uintptr_t) cqe->user_data; +} + static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, - void *addr, unsigned len, off_t offset) + const void *addr, unsigned len, + off_t offset) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = op; @@ -86,8 +117,8 @@ static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, }
static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd, - struct iovec *iovecs, unsigned nr_vecs, - off_t offset) + const struct iovec *iovecs, + unsigned nr_vecs, off_t offset) { io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset); } @@ -100,14 +131,14 @@ static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, }
static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd, - struct iovec *iovecs, unsigned nr_vecs, - off_t offset) + const struct iovec *iovecs, + unsigned nr_vecs, off_t offset) { io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset); }
static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, - void *buf, unsigned nbytes, + const void *buf, unsigned nbytes, off_t offset) { io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset); @@ -131,13 +162,22 @@ static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe, }
static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd, - int datasync) + unsigned fsync_flags) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = IORING_OP_FSYNC; sqe->fd = fd; - if (datasync) - sqe->fsync_flags = IORING_FSYNC_DATASYNC; + sqe->fsync_flags = fsync_flags; +} + +static inline void io_uring_prep_nop(struct io_uring_sqe *sqe) +{ + memset(sqe, 0, sizeof(*sqe)); + sqe->opcode = IORING_OP_NOP; +} + +#ifdef __cplusplus } +#endif
#endif diff --git a/tools/io_uring/queue.c b/tools/io_uring/queue.c index 88505e873ad9..321819c132c7 100644 --- a/tools/io_uring/queue.c +++ b/tools/io_uring/queue.c @@ -8,8 +8,8 @@ #include "liburing.h" #include "barrier.h"
-static int __io_uring_get_completion(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr, int wait) +static int __io_uring_get_cqe(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, int wait) { struct io_uring_cq *cq = &ring->cq; const unsigned mask = *cq->kring_mask; @@ -39,34 +39,25 @@ static int __io_uring_get_completion(struct io_uring *ring, return -errno; } while (1);
- if (*cqe_ptr) { - *cq->khead = head + 1; - /* - * Ensure that the kernel sees our new head, the kernel has - * the matching read barrier. - */ - write_barrier(); - } - return 0; }
/* - * Return an IO completion, if one is readily available + * Return an IO completion, if one is readily available. Returns 0 with + * cqe_ptr filled in on success, -errno on failure. */ -int io_uring_get_completion(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr) +int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) { - return __io_uring_get_completion(ring, cqe_ptr, 0); + return __io_uring_get_cqe(ring, cqe_ptr, 0); }
/* - * Return an IO completion, waiting for it if necessary + * Return an IO completion, waiting for it if necessary. Returns 0 with + * cqe_ptr filled in on success, -errno on failure. */ -int io_uring_wait_completion(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr) +int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) { - return __io_uring_get_completion(ring, cqe_ptr, 1); + return __io_uring_get_cqe(ring, cqe_ptr, 1); }
/* @@ -78,7 +69,7 @@ int io_uring_submit(struct io_uring *ring) { struct io_uring_sq *sq = &ring->sq; const unsigned mask = *sq->kring_mask; - unsigned ktail, ktail_next, submitted; + unsigned ktail, ktail_next, submitted, to_submit; int ret;
/* @@ -100,7 +91,8 @@ int io_uring_submit(struct io_uring *ring) */ submitted = 0; ktail = ktail_next = *sq->ktail; - while (sq->sqe_head < sq->sqe_tail) { + to_submit = sq->sqe_tail - sq->sqe_head; + while (to_submit--) { ktail_next++; read_barrier();
@@ -136,7 +128,7 @@ int io_uring_submit(struct io_uring *ring) if (ret < 0) return -errno;
- return 0; + return ret; }
/* diff --git a/tools/io_uring/setup.c b/tools/io_uring/setup.c index 4da19a77132c..0b50fcd78520 100644 --- a/tools/io_uring/setup.c +++ b/tools/io_uring/setup.c @@ -27,7 +27,7 @@ static int io_uring_mmap(int fd, struct io_uring_params *p, sq->kdropped = ptr + p->sq_off.dropped; sq->array = ptr + p->sq_off.array;
- size = p->sq_entries * sizeof(struct io_uring_sqe), + size = p->sq_entries * sizeof(struct io_uring_sqe); sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); @@ -79,7 +79,7 @@ int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) { struct io_uring_params p; - int fd; + int fd, ret;
memset(&p, 0, sizeof(p)); p.flags = flags; @@ -88,7 +88,11 @@ int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) if (fd < 0) return fd;
- return io_uring_queue_mmap(fd, &p, ring); + ret = io_uring_queue_mmap(fd, &p, ring); + if (ret) + close(fd); + + return ret; }
void io_uring_queue_exit(struct io_uring *ring) diff --git a/tools/io_uring/syscall.c b/tools/io_uring/syscall.c index 6b835e5c6a5b..b22e0aa54e9d 100644 --- a/tools/io_uring/syscall.c +++ b/tools/io_uring/syscall.c @@ -7,34 +7,46 @@ #include <signal.h> #include "liburing.h"
-#if defined(__x86_64) || defined(__i386__) -#ifndef __NR_sys_io_uring_setup -#define __NR_sys_io_uring_setup 425 -#endif -#ifndef __NR_sys_io_uring_enter -#define __NR_sys_io_uring_enter 426 -#endif -#ifndef __NR_sys_io_uring_register -#define __NR_sys_io_uring_register 427 -#endif -#else -#error "Arch not supported yet" +#ifdef __alpha__ +/* + * alpha is the only exception, all other architectures + * have common numbers for new system calls. + */ +# ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 535 +# endif +# ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 536 +# endif +# ifndef __NR_io_uring_register +# define __NR_io_uring_register 537 +# endif +#else /* !__alpha__ */ +# ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 425 +# endif +# ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 426 +# endif +# ifndef __NR_io_uring_register +# define __NR_io_uring_register 427 +# endif #endif
int io_uring_register(int fd, unsigned int opcode, void *arg, unsigned int nr_args) { - return syscall(__NR_sys_io_uring_register, fd, opcode, arg, nr_args); + return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); }
-int io_uring_setup(unsigned entries, struct io_uring_params *p) +int io_uring_setup(unsigned int entries, struct io_uring_params *p) { - return syscall(__NR_sys_io_uring_setup, entries, p); + return syscall(__NR_io_uring_setup, entries, p); }
-int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete, - unsigned flags, sigset_t *sig) +int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete, + unsigned int flags, sigset_t *sig) { - return syscall(__NR_sys_io_uring_enter, fd, to_submit, min_complete, + return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, sig, _NSIG / 8); }