From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
In some scenarios, such as spark-sql, almost all meta files are smaller than 2MB and applications read these small files in random mode. That means multiple random I/Os may be issued to a rotational disk, which can cause performance degradation.
To improve random reads of small files, we try to read the first 2MB of the file into the pagecache on the first read. This avoids issuing multiple random I/Os.
In fact, applications can call the fadvise system call with POSIX_FADV_WILLNEED to achieve this goal. However, some applications cannot easily do that, so we provide a new file flag, FMODE_WILLNEED.
Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/fs.h | 3 +++ mm/readahead.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+)
diff --git a/include/linux/fs.h b/include/linux/fs.h index 36d828c741d5c..05c85ee240aff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -169,6 +169,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)0x20000000)
+/* File will try to read head of the file into pagecache */ +#define FMODE_WILLNEED ((__force fmode_t)0x40000000) + /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are diff --git a/mm/readahead.c b/mm/readahead.c index 205ac348bb4ae..89da1e7f0aee0 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -24,6 +24,7 @@
#include "internal.h"
+#define READAHEAD_FIRST_SIZE (2 * 1024 * 1024) /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -491,6 +492,35 @@ ondemand_readahead(struct address_space *mapping, return ra_submit(ra, mapping, filp); }
+/* + * Try to read first @ra_size from head of the file. + */ +static bool page_cache_readahead_from_head(struct address_space *mapping, + struct file *filp, pgoff_t offset, + unsigned long req_size, + unsigned long ra_size) +{ + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct file_ra_state *ra = &filp->f_ra; + unsigned long size = min_t(unsigned long, ra_size, + file_inode(filp)->i_size); + unsigned long nrpages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned long max_pages; + unsigned int offs = 0; + + /* Cannot read date over target size, back to normal way */ + if (offset + req_size > nrpages) + return false; + + max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); + max_pages = min(max_pages, nrpages); + while (offs < nrpages) { + force_page_cache_readahead(mapping, filp, offs, max_pages); + offs += max_pages; + } + return true; +} + /** * page_cache_sync_readahead - generic file readahead * @mapping: address_space which holds the pagecache and I/O vectors @@ -516,6 +546,12 @@ void page_cache_sync_readahead(struct address_space *mapping, if (blk_cgroup_congested()) return;
+ /* try to read first READAHEAD_FIRST_SIZE into pagecache */ + if (filp && (filp->f_mode & FMODE_WILLNEED) && + page_cache_readahead_from_head(mapping, filp, + offset, req_size, READAHEAD_FIRST_SIZE)) + return; + /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { force_page_cache_readahead(mapping, filp, offset, req_size);
From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
This patch adds a new struct xfs_writable_file in include/uapi/linux/xfs.h and adds a writable tracepoint through which an eBPF program can modify struct xfs_writable_file. The f_mode value set by the eBPF program is then ORed into file->f_mode, so the eBPF program can control the file.
Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/xfs/xfs_file.c | 13 +++++++++++- fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_trace.h | 46 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/xfs.h | 15 +++++++++++++ 4 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/xfs.h
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1b2eb9d055ba0..ffc388c8b4523 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -232,10 +232,21 @@ xfs_file_buffered_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + struct file *filp = iocb->ki_filp; + struct xfs_inode *ip = XFS_I(file_inode(filp)); ssize_t ret; + struct xfs_writable_file file; + + file.name = file_dentry(filp)->d_name.name; + file.f_mode = 0; + file.i_size = file_inode(filp)->i_size; + file.prev_pos = filp->f_ra.prev_pos;
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + trace_xfs_file_read(&file, ip, iov_iter_count(to), iocb->ki_pos); + + if (file.f_mode) + filp->f_mode |= file.f_mode;
if (iocb->ki_flags & IOCB_NOWAIT) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index edbd5a210df22..086173507c271 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -60,6 +60,7 @@ typedef __u32 xfs_nlink_t; #include <linux/list_sort.h> #include <linux/ratelimit.h> #include <linux/rhashtable.h> +#include <uapi/linux/xfs.h>
#include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a3e7813778b06..678a02d145445 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3371,6 +3371,52 @@ TRACE_EVENT(xfs_iunlink_update_bucket, __entry->new_ptr) );
+DECLARE_EVENT_CLASS(xfs_file_read, + TP_PROTO(struct xfs_writable_file *file, struct xfs_inode *ip, + size_t count, loff_t offset), + TP_ARGS(file, ip, count, offset), + TP_STRUCT__entry( + __field(struct xfs_writable_file *, file) + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, offset) + __field(size_t, count) + ), + TP_fast_assign( + __entry->file = 0; + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx offset 0x%llx count 0x%zx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->offset, + __entry->count) +); + +#ifdef DEFINE_EVENT_WRITABLE +#undef XFS_DEFINE_EVENT +#define XFS_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \ + PARAMS(args), size) +#else +#undef XFS_DEFINE_EVENT +#define XFS_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args)) +#endif + +XFS_DEFINE_EVENT(xfs_file_read, xfs_file_read, + + TP_PROTO(struct xfs_writable_file *file, struct xfs_inode *ip, + size_t count, loff_t offset), + + TP_ARGS(file, ip, count, offset), + + sizeof(struct xfs_writable_file) +); + #endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH diff --git a/include/uapi/linux/xfs.h b/include/uapi/linux/xfs.h new file mode 100644 index 0000000000000..635a83914273b --- /dev/null +++ b/include/uapi/linux/xfs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_XFS_H +#define _UAPI_LINUX_XFS_H + +#include <linux/types.h> + +struct xfs_writable_file { + const unsigned char *name; + unsigned int f_mode; /* can be set into file->f_mode */ + long long i_size; /* file size */ + long long prev_pos; /* ra->prev_pos */ +}; + +#endif /* _UAPI_LINUX_XFS_H */
From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
Declare the bpf_probe_read_str() helper in bpf_helpers.h so that it can be used in eBPF programs.
Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/testing/selftests/bpf/bpf_helpers.h | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index e4be7730222df..556688e820264 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -18,6 +18,8 @@ static int (*bpf_map_delete_elem)(void *map, void *key) = (void *) BPF_FUNC_map_delete_elem; static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = (void *) BPF_FUNC_probe_read; +static int (*bpf_probe_read_str)(void *dst, int size, void *unsafe_ptr) = + (void *) BPF_FUNC_probe_read_str; static unsigned long long (*bpf_ktime_get_ns)(void) = (void *) BPF_FUNC_ktime_get_ns; static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
Usage: cd tools/testing/selftests; make -C bpf -j; ./test_xfs_file # running
Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/include/uapi/linux/xfs.h | 17 +++++++ tools/testing/selftests/bpf/Makefile | 5 +- .../testing/selftests/bpf/test_set_xfs_file.c | 40 +++++++++++++++ tools/testing/selftests/bpf/test_xfs_file.c | 51 +++++++++++++++++++ 4 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 tools/include/uapi/linux/xfs.h create mode 100644 tools/testing/selftests/bpf/test_set_xfs_file.c create mode 100644 tools/testing/selftests/bpf/test_xfs_file.c
diff --git a/tools/include/uapi/linux/xfs.h b/tools/include/uapi/linux/xfs.h new file mode 100644 index 0000000000000..f333a2eb74074 --- /dev/null +++ b/tools/include/uapi/linux/xfs.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_XFS_H +#define _UAPI_LINUX_XFS_H + +#include <linux/types.h> + +#define FMODE_RANDOM (0x1000) +#define FMODE_WILLNEED (0x40000000) + +struct xfs_writable_file { + const unsigned char *name; + unsigned int f_mode; /* can be set into file->f_mode */ + long long i_size; /* file size */ + long long prev_pos; /* ra->prev_pos */ +}; + +#endif /* _UAPI_LINUX_XFS_H */ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f3f874ba186bb..1ba656a8ed656 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -23,7 +23,8 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ - test_socket_cookie test_cgroup_storage test_select_reuseport + test_socket_cookie test_cgroup_storage test_select_reuseport test_xfs_file +
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ @@ -35,7 +36,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o + test_skb_cgroup_id_kern.o test_set_xfs_file.o
# Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/test_set_xfs_file.c b/tools/testing/selftests/bpf/test_set_xfs_file.c new file mode 100644 index 0000000000000..0b289bbc3985d --- /dev/null +++ b/tools/testing/selftests/bpf/test_set_xfs_file.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include "bpf_helpers.h" +#include <string.h> +#include <linux/xfs.h> + +/* from /sys/kernel/debug/tracing/events/xfs/xfs_read_file */ +struct xfs_read_buffer_args { + struct xfs_writable_file *file; +}; + +SEC("tracepoint/xfs/xfs_file_read") +int bpf_prog1(struct xfs_read_buffer_args *ctx) +{ + char fmt[] = "name: %s, f_mode:%d, i_size:%lu\n"; + struct xfs_writable_file *file = ctx->file; + char name[64] = {}; + char *tmp; + unsigned long i_size; + int len; + + bpf_probe_read(&tmp, 8, &(file->name)); + len = bpf_probe_read_str(name, 64, tmp); + bpf_probe_read(&i_size, 8, &(file->i_size)); + + if (!strncmp("blk_", name, 4)) { + /* blk_xxx.meta or blk_xxx with size < 2M */ + if (len == 27 || (len == 15 && i_size <= 2 * 1024 * 1024)) + file->f_mode |= FMODE_WILLNEED; + /* blk_xxx */ + else if (len == 15) + file->f_mode |= FMODE_RANDOM; + bpf_trace_printk(fmt, sizeof(fmt), name, file->f_mode, i_size); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/test_xfs_file.c b/tools/testing/selftests/bpf/test_xfs_file.c new file mode 100644 index 0000000000000..d0bc971d93bf9 --- /dev/null +++ b/tools/testing/selftests/bpf/test_xfs_file.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <assert.h> +#include <sys/time.h> +#include <unistd.h> + +#include <linux/bpf.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "bpf_rlimit.h" + +#define SLEEP_SECS 9999999 + +int main(int argc, char *argv[]) +{ + 
const char *file = "./test_set_xfs_file.o"; + struct bpf_object *obj; + int efd, err, prog_fd; + int delay = SLEEP_SECS; + char *endptr, *str; + + if (argc == 2) { + str = argv[1]; + delay = strtol(str, &endptr, 10); + } + + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, &obj, + &prog_fd); + if (err) { + printf("Failed to load xfs program\n"); + goto out; + } + + efd = bpf_raw_tracepoint_open("xfs_file_read", prog_fd); + if (efd < 0) { + printf("Fail to open tracepoint, efd %d\n", efd); + goto out; + } + + sleep(delay); + + printf("END\n"); + +out: + return err; +}