From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: 173267 CVE: NA ---------------------------
Add a new member, clear_f_mode, to struct xfs_writable_file so that selected flags can be cleared from file->f_mode.
Signed-off-by: Yufen Yu yuyufen@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/xfs/xfs_file.c | 10 +++++++--- include/linux/fs.h | 9 ++++++--- include/uapi/linux/xfs.h | 4 +++- tools/include/uapi/linux/xfs.h | 2 +- 4 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ffc388c8b4523..bd8ae4df20042 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -35,6 +35,8 @@ #include <linux/mman.h> #include <linux/fadvise.h>
+#define FMODE_MASK (FMODE_RANDOM | FMODE_WILLNEED | FMODE_SPC_READAHEAD) + static const struct vm_operations_struct xfs_file_vm_ops;
int @@ -238,15 +240,17 @@ xfs_file_buffered_aio_read( struct xfs_writable_file file;
file.name = file_dentry(filp)->d_name.name; + file.clear_f_mode = 0; file.f_mode = 0; file.i_size = file_inode(filp)->i_size; - file.prev_pos = filp->f_ra.prev_pos; + file.prev_pos = filp->f_ra.prev_pos >> PAGE_SHIFT; + file.pos = iocb->ki_pos >> PAGE_SHIFT;
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); trace_xfs_file_read(&file, ip, iov_iter_count(to), iocb->ki_pos);
- if (file.f_mode) - filp->f_mode |= file.f_mode; + filp->f_mode |= file.f_mode & FMODE_MASK; + filp->f_mode &= ~(file.clear_f_mode & FMODE_MASK);
if (iocb->ki_flags & IOCB_NOWAIT) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) diff --git a/include/linux/fs.h b/include/linux/fs.h index f5bc43ac95035..394da46d143c2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -160,6 +160,12 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)0x200000)
+/* File will try to read head of the file into pagecache */ +#define FMODE_WILLNEED ((__force fmode_t)0x400000) + +/* File will do special readahead */ +#define FMODE_SPC_READAHEAD ((__force fmode_t)0x800000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
@@ -169,9 +175,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)0x20000000)
-/* File will try to read head of the file into pagecache */ -#define FMODE_WILLNEED ((__force fmode_t)0x40000000) - /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are diff --git a/include/uapi/linux/xfs.h b/include/uapi/linux/xfs.h index 635a83914273b..0a11c2344e5a3 100644 --- a/include/uapi/linux/xfs.h +++ b/include/uapi/linux/xfs.h @@ -7,9 +7,11 @@
struct xfs_writable_file { const unsigned char *name; + unsigned int clear_f_mode; /* can be cleared from file->f_mode */ unsigned int f_mode; /* can be set into file->f_mode */ long long i_size; /* file size */ - long long prev_pos; /* ra->prev_pos */ + long long prev_pos; /* ra->prev_pos page index */ + long long pos; /* iocb->ki_pos page index */ };
#endif /* _UAPI_LINUX_XFS_H */ diff --git a/tools/include/uapi/linux/xfs.h b/tools/include/uapi/linux/xfs.h index f333a2eb74074..2c4c61d5ba539 100644 --- a/tools/include/uapi/linux/xfs.h +++ b/tools/include/uapi/linux/xfs.h @@ -5,7 +5,7 @@ #include <linux/types.h>
#define FMODE_RANDOM (0x1000) -#define FMODE_WILLNEED (0x40000000) +#define FMODE_WILLNEED (0x400000)
struct xfs_writable_file { const unsigned char *name;
From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: 173267 CVE: NA ---------------------------
If the page index of ra->prev_pos is equal to the page index of the current pos, the read is sequential, so clear the FMODE_RANDOM flag to enable async readahead.
Usage: run `make -C bpf`, then `./test_xfs_file clear`.
Signed-off-by: Yufen Yu yuyufen@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/include/uapi/linux/xfs.h | 4 +- tools/testing/selftests/bpf/Makefile | 2 +- .../selftests/bpf/test_clear_xfs_file.c | 43 +++++++++++++++++++ tools/testing/selftests/bpf/test_xfs_file.c | 11 +++-- 4 files changed, 55 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/bpf/test_clear_xfs_file.c
diff --git a/tools/include/uapi/linux/xfs.h b/tools/include/uapi/linux/xfs.h index 2c4c61d5ba539..a0d37e411ee18 100644 --- a/tools/include/uapi/linux/xfs.h +++ b/tools/include/uapi/linux/xfs.h @@ -9,9 +9,11 @@
struct xfs_writable_file { const unsigned char *name; + unsigned int clear_f_mode; /* can be cleared from file->f_mode */ unsigned int f_mode; /* can be set into file->f_mode */ long long i_size; /* file size */ - long long prev_pos; /* ra->prev_pos */ + long long prev_pos; /* ra->prev_pos page index */ + long long pos; /* iocb->ki_pos page index */ };
#endif /* _UAPI_LINUX_XFS_H */ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1ba656a8ed656..8d2737285f185 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -36,7 +36,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o test_set_xfs_file.o + test_skb_cgroup_id_kern.o test_set_xfs_file.o test_clear_xfs_file.o
# Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/test_clear_xfs_file.c b/tools/testing/selftests/bpf/test_clear_xfs_file.c new file mode 100644 index 0000000000000..167ee63207232 --- /dev/null +++ b/tools/testing/selftests/bpf/test_clear_xfs_file.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include "bpf_helpers.h" +#include <string.h> +#include <linux/xfs.h> + +/* from /sys/kernel/debug/tracing/events/xfs/xfs_read_file */ +struct xfs_read_buffer_args { + struct xfs_writable_file *file; +}; + +SEC("tracepoint/xfs/xfs_file_read") +int bpf_prog1(struct xfs_read_buffer_args *ctx) +{ + char fmt[] = "name: %s, clear_f_mode: %u, f_mode: %u\n"; + struct xfs_writable_file *file = ctx->file; + char name[64] = {}; + char *tmp; + unsigned long i_size; + int len; + + bpf_probe_read(&tmp, 8, &(file->name)); + len = bpf_probe_read_str(name, 64, tmp); + bpf_probe_read(&i_size, 8, &(file->i_size)); + + if (!strncmp("blk_", name, 4)) { + /* blk_xxx.meta or blk_xxx with size < 2M */ + if (len == 27 || (len == 15 && i_size <= 2 * 1024 * 1024)) { + file->f_mode |= FMODE_WILLNEED; + /* blk_xxx */ + } else if (len == 15) { + if (file->prev_pos == file->pos) + file->clear_f_mode |= FMODE_RANDOM; + } + bpf_trace_printk(fmt, sizeof(fmt), name, file->clear_f_mode, + file->f_mode); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/test_xfs_file.c b/tools/testing/selftests/bpf/test_xfs_file.c index d0bc971d93bf9..247c42be029b2 100644 --- a/tools/testing/selftests/bpf/test_xfs_file.c +++ b/tools/testing/selftests/bpf/test_xfs_file.c @@ -18,17 +18,22 @@
int main(int argc, char *argv[]) { - const char *file = "./test_set_xfs_file.o"; + const char *set_file = "./test_set_xfs_file.o"; + const char *clear_file = "./test_clear_xfs_file.o"; + const char *file = set_file; struct bpf_object *obj; int efd, err, prog_fd; int delay = SLEEP_SECS; char *endptr, *str;
- if (argc == 2) { - str = argv[1]; + if (argc == 3) { + str = argv[2]; delay = strtol(str, &endptr, 10); }
+ if (argc >= 2 && !strcmp("clear", argv[1])) + file = clear_file; + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, &obj, &prog_fd); if (err) {
From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: 173267 CVE: NA ---------------------------
Hibench applications, including kmeans, wordcount and terasort, read whole blk_xxx and blk_xxx.meta files from disk sequentially, and almost all of the reads issued to disk are triggered by async readahead.
However, a sequential read in a single thread does not mean sequential IO on disk when multiple threads do that concurrently. Interleaved sequential reads from multiple threads can make the IO issued to disk become random, which limits disk IO throughput.
To reduce disk randomization, we can consider increasing the readahead window, so that the IO generated by the filesystem is bigger for each async readahead. However, limited by the disk's max_hw_sectors_kb, a big IO will be split, and the whole bio needs to wait for all of the split bios to complete, which can cause longer IO latency.
Our trace shows that many long latencies in threads are caused by waiting for async readahead IO to complete when the readahead window is set to a big value. That means the threads read data faster than the async readahead IO completes.
To improve performance, we try to provide a special async readahead method:
* On the one hand, we try to read more sequential data from disk, which can reduce disk randomization when multiple threads interleave their reads.
* On the other hand, the size of each IO issued to disk is 2M, which avoids splitting big IOs and the resulting long IO latency.
Signed-off-by: Yufen Yu yuyufen@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/readahead.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+)
diff --git a/mm/readahead.c b/mm/readahead.c index 89da1e7f0aee0..7a21199c6227d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -25,6 +25,9 @@ #include "internal.h"
#define READAHEAD_FIRST_SIZE (2 * 1024 * 1024) +#define READAHEAD_MIN_SIZE (2 * 1024 * 1024) +#define READAHEAD_ASYNC_RATIO 2 +#define FILE_READAHEAD_TIMES 4 /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -563,6 +566,30 @@ void page_cache_sync_readahead(struct address_space *mapping, } EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
+static void do_special_async_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp) +{ + loff_t isize = i_size_read(file_inode(filp)); + unsigned long nrpages = DIV_ROUND_UP(isize, PAGE_SIZE); + unsigned long size = DIV_ROUND_UP(nrpages, FILE_READAHEAD_TIMES); + unsigned int each_ra_size = READAHEAD_MIN_SIZE / PAGE_SIZE; + unsigned long set_page_readahead = size / READAHEAD_ASYNC_RATIO; + + while (size > 0) { + if (ra->start + ra->size > nrpages) + break; + ra->start += ra->size; + ra->size = each_ra_size; + /* SetPageReadahead to do next async readahead */ + if (size == set_page_readahead) + ra->async_size = ra->size; + else + ra->async_size = 0; + ra_submit(ra, mapping, filp); + size -= min_t(unsigned long, size, each_ra_size); + } +} + /** * page_cache_async_readahead - file readahead for marked pages * @mapping: address_space which holds the pagecache and I/O vectors @@ -605,6 +632,11 @@ page_cache_async_readahead(struct address_space *mapping, if (blk_cgroup_congested()) return;
+ if (filp && (filp->f_mode & FMODE_SPC_READAHEAD)) { + do_special_async_readahead(mapping, ra, filp); + return; + } + /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); }
From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: 173267 CVE: NA ---------------------------
For hibench applications such as kmeans, wordcount and terasort, we can try to use this bpf tool to improve IO performance.
Usage: run `make -C bpf`, then `./test_xfs_file spec_readahead`.
Signed-off-by: Yufen Yu yuyufen@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/include/uapi/linux/xfs.h | 1 + tools/testing/selftests/bpf/Makefile | 3 +- .../bpf/test_spec_readahead_xfs_file.c | 39 +++++++++++++++++++ tools/testing/selftests/bpf/test_xfs_file.c | 9 ++++- 4 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/test_spec_readahead_xfs_file.c
diff --git a/tools/include/uapi/linux/xfs.h b/tools/include/uapi/linux/xfs.h index a0d37e411ee18..1409b45affd34 100644 --- a/tools/include/uapi/linux/xfs.h +++ b/tools/include/uapi/linux/xfs.h @@ -6,6 +6,7 @@
#define FMODE_RANDOM (0x1000) #define FMODE_WILLNEED (0x400000) +#define FMODE_SPC_READAHEAD (0x800000)
struct xfs_writable_file { const unsigned char *name; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8d2737285f185..46b1d5b864f5a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -36,7 +36,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o test_set_xfs_file.o test_clear_xfs_file.o + test_skb_cgroup_id_kern.o test_set_xfs_file.o test_clear_xfs_file.o \ + test_spec_readahead_xfs_file.o
# Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/test_spec_readahead_xfs_file.c b/tools/testing/selftests/bpf/test_spec_readahead_xfs_file.c new file mode 100644 index 0000000000000..ff8794a14cdcd --- /dev/null +++ b/tools/testing/selftests/bpf/test_spec_readahead_xfs_file.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include "bpf_helpers.h" +#include <string.h> +#include <linux/xfs.h> + +/* from /sys/kernel/debug/tracing/events/xfs/xfs_read_file */ +struct xfs_read_buffer_args { + struct xfs_writable_file *file; +}; + +SEC("tracepoint/xfs/xfs_file_read") +int bpf_prog1(struct xfs_read_buffer_args *ctx) +{ + char fmt[] = "name: %s, set f_mode: %u\n"; + struct xfs_writable_file *file = ctx->file; + char name[64] = {}; + char *tmp; + unsigned long i_size; + int len; + + bpf_probe_read(&tmp, 8, &(file->name)); + len = bpf_probe_read_str(name, 64, tmp); + bpf_probe_read(&i_size, 8, &(file->i_size)); + + if (!strncmp("blk_", name, 4)) { + /* blk_xxx.meta or blk_xxx with size < 2M */ + if (len == 27 || (len == 15 && i_size <= 2 * 1024 * 1024)) + file->f_mode |= FMODE_WILLNEED; + else if (len == 15) /* blk_xxx */ + file->f_mode |= FMODE_SPC_READAHEAD; + bpf_trace_printk(fmt, sizeof(fmt), name, file->f_mode); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/test_xfs_file.c b/tools/testing/selftests/bpf/test_xfs_file.c index 247c42be029b2..89e79d959677c 100644 --- a/tools/testing/selftests/bpf/test_xfs_file.c +++ b/tools/testing/selftests/bpf/test_xfs_file.c @@ -20,6 +20,7 @@ int main(int argc, char *argv[]) { const char *set_file = "./test_set_xfs_file.o"; const char *clear_file = "./test_clear_xfs_file.o"; + const char *spec_readahead_file = "./test_spec_readahead_xfs_file.o"; const char *file = set_file; struct bpf_object *obj; int efd, err, prog_fd; @@ -31,8 +32,12 @@ int 
main(int argc, char *argv[]) delay = strtol(str, &endptr, 10); }
- if (argc >= 2 && !strcmp("clear", argv[1])) - file = clear_file; + if (argc >= 2) { + if (!strcmp("clear", argv[1])) + file = clear_file; + if (!strcmp("spec_readahead", argv[1])) + file = spec_readahead_file; + }
err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, &obj, &prog_fd);