From: Prasad Singamsetty prasad.singamsetty@oracle.com
maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VTE3 CVE: NA
Reference: https://lore.kernel.org/all/20240326133813.3224593-4-john.g.garry@oracle.com...
--------------------------------
An atomic write is a write issued with torn-write protection, meaning that for a power failure or any other hardware failure, all or none of the data from the write will be stored, but never a mix of old and new data.
Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the write is to be issued with torn-write prevention, according to special alignment and length rules.
For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for iocb->ki_flags field to indicate the same.
A call to statx will give the relevant atomic write info for a file: - atomic_write_unit_min - atomic_write_unit_max - atomic_write_segments_max
Both min and max values must be a power-of-2.
Applications can avail of atomic write feature by ensuring that the total length of a write is a power-of-2 in size and also sized between atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications must ensure that the write is at a naturally-aligned offset in the file wrt the total write length. The value in atomic_write_segments_max indicates the upper limit for IOV_ITER iovcnt.
Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the flag set will have RWF_ATOMIC rejected and not just ignored.
Add a type argument to kiocb_set_rw_flags() to allows reads which have RWF_ATOMIC set to be rejected.
Helper function generic_atomic_write_valid() can be used by FSes to verify compliant writes. There we check for iov_iter type is for ubuf, which implies iovcnt==1 for pwritev2(), which is an initial restriction for atomic_write_segments_max. Initially the only user will be bdev file operations write handler. We will rely on the block BIO submission path to ensure write sizes are compliant for the bdev, so we don't need to check atomic writes sizes yet.
Signed-off-by: Prasad Singamsetty prasad.singamsetty@oracle.com jpg: merge into single patch and much rewrite Signed-off-by: John Garry john.g.garry@oracle.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/aio.c | 8 ++++---- fs/read_write.c | 2 +- include/linux/fs.h | 30 +++++++++++++++++++++++++++++- include/uapi/linux/fs.h | 5 ++++- io_uring/io_uring.c | 2 +- 5 files changed, 39 insertions(+), 8 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index 00641a1ad0b3..78aaeaf35436 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1458,7 +1458,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) iocb_put(iocb); }
-static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret;
@@ -1485,7 +1485,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) } else req->ki_ioprio = get_current_ioprio();
- ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret;
@@ -1537,7 +1537,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret;
- ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; @@ -1565,7 +1565,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret;
- ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; diff --git a/fs/read_write.c b/fs/read_write.c index 371a5a76f30e..da03b3e65cf3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -726,7 +726,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, ssize_t ret;
init_sync_kiocb(&kiocb, filp); - ret = kiocb_set_rw_flags(&kiocb, flags); + ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); diff --git a/include/linux/fs.h b/include/linux/fs.h index 382a0d4dd3dd..9d7e901b71fd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -184,6 +184,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports async buffered reads */ #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000)
+/* File supports atomic writes */ +#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)0x80000000) + /* File mode control flag, expect random access pattern */ #define FMODE_CTL_RANDOM ((__force fmode_t)0x1000)
@@ -320,6 +323,7 @@ enum rw_hint { #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_ATOMIC (__force int) RWF_ATOMIC
/* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -3406,7 +3410,8 @@ static inline int iocb_flags(struct file *file) return res; }
-static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, + int rw_type) { int kiocb_flags = 0;
@@ -3423,6 +3428,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if (flags & RWF_ATOMIC) { + if (rw_type != WRITE) + return -EOPNOTSUPP; + if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; @@ -3665,4 +3676,21 @@ static inline void fs_file_read_do_trace(struct kiocb *iocb) if (tracepoint_enabled(fs_file_read)) fs_file_read_update_args_by_trace(iocb); } + +static inline +bool generic_atomic_write_valid(loff_t pos, size_t len, + unsigned int unit_min, unsigned int unit_max) +{ + if (len < unit_min || len > unit_max) + return false; + + if (!is_power_of_2(len)) + return false; + + if (!IS_ALIGNED(pos, len)) + return false; + + return true; +} + #endif /* _LINUX_FS_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f44eb0a04afd..78f4091ed188 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -300,8 +300,11 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010)
+/* Atomic Write */ +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_ATOMIC)
#endif /* _UAPI_LINUX_FS_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 65cf70874fb3..c284e9865826 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2956,7 +2956,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); - ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags), rw); if (unlikely(ret)) return ret;