Hou Tao (5): vfs: add bare tracepoints for vfs read and release fs: add helper fs_file_read_do_trace() xfs: add trace for read and release of regular file ext4: add trace for the read and release of regular file selftests/bpf: add demo for file read pattern detection
Yufen Yu (1): readahead: introduce FMODE_CTL_WILLNEED to read first 2MB of file
ZhaoLong Wang (2): VFS: Rolling Back the fmode macro definition and structure members selftests/bpf: Update the demo file_read_pattern to run on libbpf 1.0+
fs/ext4/file.c | 4 + fs/read_write.c | 38 +++++ fs/xfs/xfs_file.c | 3 + include/linux/fs.h | 37 +++++ include/trace/events/fs.h | 33 +++++ mm/readahead.c | 40 ++++- tools/testing/selftests/bpf/Makefile | 1 + .../testing/selftests/bpf/file_read_pattern.c | 73 +++++++++ .../bpf/progs/file_read_pattern_prog.c | 138 ++++++++++++++++++ 9 files changed, 366 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/fs.h create mode 100644 tools/testing/selftests/bpf/file_read_pattern.c create mode 100644 tools/testing/selftests/bpf/progs/file_read_pattern_prog.c
From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7Y9JD CVE: NA
-------------------------------------------------
In some scenarios, such as spark-sql, almost all meta files are smaller than 2MB and applications read these small files in random mode. That means multiple random I/Os may be issued to a rotational disk, which can cause performance degradation.
To improve random reads of small files, we try to read the first 2MB of the file into the page cache on the first read. This avoids issuing multiple random I/Os.
In fact, applications can call the fadvise system call with POSIX_FADV_WILLNEED to achieve this goal. However, some applications cannot easily do that, so we provide a new file flag, FMODE_CTL_WILLNEED.
Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Conflicts: include/linux/fs.h Value '0x40000000' has been used for flag FMODE_BUF_RASYNC. Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com
Conflicts: include/linux/fs.h mm/readahead.c --- include/linux/fs.h | 7 +++++++ mm/readahead.c | 40 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h index 133f0640fb24..931fe15dd17c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -189,6 +189,12 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports async nowait buffered writes */ #define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000)
+/* File mode control flag, expect random access pattern */ +#define FMODE_CTL_RANDOM ((__force fmode_t)0x1) + +/* File mode control flag, will try to read head of the file into pagecache */ +#define FMODE_CTL_WILLNEED ((__force fmode_t)0x2) + /* * Attribute flags. These should be or-ed together to figure out what * has been changed! @@ -974,6 +980,7 @@ struct file { atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; + fmode_t f_ctl_mode; struct mutex f_pos_lock; loff_t f_pos; struct fown_struct f_owner; diff --git a/mm/readahead.c b/mm/readahead.c index 47afbca1d122..7148f6f42ed4 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -132,6 +132,7 @@
#include "internal.h"
+#define READAHEAD_FIRST_SIZE (2 * 1024 * 1024) /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -682,10 +683,41 @@ static void ondemand_readahead(struct readahead_control *ractl, page_cache_ra_order(ractl, ra, order); }
+/* + * Try to read first @ra_size from head of the file. + */ +static bool page_cache_readahead_from_head(struct address_space *mapping, + struct file *filp, pgoff_t offset, + unsigned long req_size, + unsigned long ra_size) +{ + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct file_ra_state *ra = &filp->f_ra; + unsigned long size = min_t(unsigned long, ra_size, + file_inode(filp)->i_size); + unsigned long nrpages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned long max_pages; + unsigned int offs = 0; + + /* Cannot read date over target size, back to normal way */ + if (offset + req_size > nrpages) + return false; + + max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); + max_pages = min(max_pages, nrpages); + while (offs < nrpages) { + force_page_cache_readahead(mapping, filp, offs, max_pages); + offs += max_pages; + } + return true; +} + void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { - bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); + bool do_forced_ra = ractl->file && + ((ractl->file->f_mode & FMODE_RANDOM) || + (ractl->file->f_ctl_mode & FMODE_CTL_RANDOM));
/* * Even if readahead is disabled, issue this request as readahead @@ -700,6 +732,12 @@ void page_cache_sync_ra(struct readahead_control *ractl, do_forced_ra = true; }
+ /* try to read first READAHEAD_FIRST_SIZE into pagecache */ + if (ractl->file && (ractl->file->f_ctl_mode & FMODE_CTL_WILLNEED) && + page_cache_readahead_from_head(ractl->mapping, ractl->file, + ractl->_index, req_count, READAHEAD_FIRST_SIZE)) + return; + /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count);
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7Y9JD CVE: NA
---------------------------
Add a writable bare tracepoint fs_file_read() and a bare tracepoint fs_file_release().
A version argument is added to fs_file_read() to support future extension of fs_file_read_ctx.
These two tracepoints need to be exported because they will be used by filesystem kernel modules.
Signed-off-by: Hou Tao houtao1@huawei.com Acked-by: fang wei fangwei1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com
Conflicts: include/linux/fs.h --- fs/read_write.c | 5 +++++ include/linux/fs.h | 17 +++++++++++++++++ include/trace/events/fs.h | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) create mode 100644 include/trace/events/fs.h
diff --git a/fs/read_write.c b/fs/read_write.c index a21ba3be7dbe..68bd1e644628 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -24,6 +24,8 @@
#include <linux/uaccess.h> #include <asm/unistd.h> +#define CREATE_TRACE_POINTS +#include <trace/events/fs.h>
const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, @@ -1718,3 +1720,6 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
return 0; } + +EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_release); diff --git a/include/linux/fs.h b/include/linux/fs.h index 931fe15dd17c..d29fd8eaf2f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3212,4 +3212,21 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice);
+struct fs_file_read_ctx { + const unsigned char *name; + unsigned int f_ctl_mode; + unsigned int rsvd; + /* clear from f_ctl_mode */ + unsigned int clr_f_ctl_mode; + /* set into f_ctl_mode */ + unsigned int set_f_ctl_mode; + unsigned long key; + /* file size */ + long long i_size; + /* previous page index */ + long long prev_index; + /* current page index */ + long long index; +}; + #endif /* _LINUX_FS_H */ diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h new file mode 100644 index 000000000000..ee82dad9d9da --- /dev/null +++ b/include/trace/events/fs.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs + +#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_H + +#include <linux/types.h> +#include <linux/tracepoint.h> +#include <linux/fs.h> + +#undef FS_DECLARE_TRACE +#ifdef DECLARE_TRACE_WRITABLE +#define FS_DECLARE_TRACE(call, proto, args, size) \ + DECLARE_TRACE_WRITABLE(call, PARAMS(proto), PARAMS(args), size) +#else +#define FS_DECLARE_TRACE(call, proto, args, size) \ + DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) +#endif + +FS_DECLARE_TRACE(fs_file_read, + TP_PROTO(struct fs_file_read_ctx *ctx, int version), + TP_ARGS(ctx, version), + sizeof(struct fs_file_read_ctx)); + +DECLARE_TRACE(fs_file_release, + TP_PROTO(struct inode *inode, struct file *filp), + TP_ARGS(inode, filp)); + +#endif /* _TRACE_FS_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h>
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7Y9JD CVE: NA
---------------------------
fs_file_read_do_trace() uses a writable tracepoint to update f_mode during the file read procedure. Also export it so that it is usable by filesystem kernel modules.
Signed-off-by: Hou Tao houtao1@huawei.com Acked-by: fang wei fangwei1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com
Conflicts: include/linux/fs.h --- fs/read_write.c | 33 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 13 +++++++++++++ 2 files changed, 46 insertions(+)
diff --git a/fs/read_write.c b/fs/read_write.c index 68bd1e644628..82dc57c3bfa1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1721,5 +1721,38 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; }
+#ifdef CONFIG_TRACEPOINTS +static void fs_file_read_ctx_init(struct fs_file_read_ctx *ctx, + struct file *filp, loff_t pos) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->name = file_dentry(filp)->d_name.name; + ctx->f_ctl_mode = filp->f_ctl_mode; + ctx->key = (unsigned long)filp; + ctx->i_size = file_inode(filp)->i_size; + ctx->prev_index = filp->f_ra.prev_pos >> PAGE_SHIFT; + ctx->index = pos >> PAGE_SHIFT; +} + +#define FS_FILE_READ_VERSION 1 +#define FS_FILE_READ_MODE_MASK (FMODE_CTL_RANDOM | FMODE_CTL_WILLNEED) + +void fs_file_read_update_args_by_trace(struct kiocb *iocb) +{ + struct file *filp = iocb->ki_filp; + struct fs_file_read_ctx ctx; + + fs_file_read_ctx_init(&ctx, filp, iocb->ki_pos); + trace_fs_file_read(&ctx, FS_FILE_READ_VERSION); + + if (!ctx.set_f_ctl_mode && !ctx.clr_f_ctl_mode) + return; + + filp->f_ctl_mode |= ctx.set_f_ctl_mode & FS_FILE_READ_MODE_MASK; + filp->f_ctl_mode &= ~(ctx.clr_f_ctl_mode & FS_FILE_READ_MODE_MASK); +} +EXPORT_SYMBOL_GPL(fs_file_read_update_args_by_trace); +#endif + EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_read); EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_release); diff --git a/include/linux/fs.h b/include/linux/fs.h index d29fd8eaf2f2..bef7f5254bd9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include <linux/cred.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> +#include <linux/tracepoint-defs.h>
#include <asm/byteorder.h> #include <uapi/linux/fs.h> @@ -3229,4 +3230,16 @@ struct fs_file_read_ctx { long long index; };
+#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(fs_file_read); +extern void fs_file_read_update_args_by_trace(struct kiocb *iocb); +#else +static inline void fs_file_read_update_args_by_trace(struct kiocb *iocb) {} +#endif + +static inline void fs_file_read_do_trace(struct kiocb *iocb) +{ + if (tracepoint_enabled(fs_file_read)) + fs_file_read_update_args_by_trace(iocb); +} #endif /* _LINUX_FS_H */
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7Y9JD CVE: NA
---------------------------
Use fs_file_read_do_trace() and trace_fs_file_release() to do that.
Signed-off-by: Hou Tao houtao1@huawei.com Acked-by: fang wei fangwei1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com
Conflicts: fs/xfs/xfs_file.c --- fs/xfs/xfs_file.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aede746541f8..66ab9108fefd 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -31,6 +31,7 @@ #include <linux/mman.h> #include <linux/fadvise.h> #include <linux/mount.h> +#include <trace/events/fs.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -270,6 +271,7 @@ xfs_file_buffered_read( ssize_t ret;
trace_xfs_file_buffered_read(iocb, to); + fs_file_read_do_trace(iocb);
ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); if (ret) @@ -1205,6 +1207,7 @@ xfs_file_release( struct inode *inode, struct file *filp) { + trace_fs_file_release(inode, filp); return xfs_release(XFS_I(inode)); }
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7Y9JD CVE: NA
---------------------------
Use fs_file_read_do_trace() and trace_fs_file_release() to do that.
Signed-off-by: Hou Tao houtao1@huawei.com Acked-by: fang wei fangwei1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com --- fs/ext4/file.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d101b3b0c7da..296004b243c6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -30,6 +30,7 @@ #include <linux/uio.h> #include <linux/mman.h> #include <linux/backing-dev.h> +#include <trace/events/fs.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -144,6 +145,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to);
+ fs_file_read_do_trace(iocb); return generic_file_read_iter(iocb, to); }
@@ -154,6 +156,8 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) */ static int ext4_release_file(struct inode *inode, struct file *filp) { + trace_fs_file_release(inode, filp); + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7Y9JD CVE: NA
---------------------------
It attaches eBPF programs to fs_file_read() and fs_file_release() respectively. The program for fs_file_read() records the read history, calculates the read pattern and sets f_mode for specific files, and the program for fs_file_release() cleans up the saved read history.
Signed-off-by: Hou Tao houtao1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com
Conflicts: tools/testing/selftests/bpf/Makefile --- tools/testing/selftests/bpf/Makefile | 1 + .../testing/selftests/bpf/file_read_pattern.c | 73 +++++++++ .../bpf/progs/file_read_pattern_prog.c | 138 ++++++++++++++++++ 3 files changed, 212 insertions(+) create mode 100644 tools/testing/selftests/bpf/file_read_pattern.c create mode 100644 tools/testing/selftests/bpf/progs/file_read_pattern_prog.c
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 28d2c77262be..ab25af503d60 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -41,6 +41,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_cgroup_storage \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 +TEST_GEN_PROGS += file_read_pattern
# Also test bpf-gcc, if present ifneq ($(BPF_GCC),) diff --git a/tools/testing/selftests/bpf/file_read_pattern.c b/tools/testing/selftests/bpf/file_read_pattern.c new file mode 100644 index 000000000000..81e3a49f0424 --- /dev/null +++ b/tools/testing/selftests/bpf/file_read_pattern.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021. Huawei Technologies Co., Ltd */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <linux/bpf.h> +#include <linux/err.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "bpf_rlimit.h" + +#define READ_TP_NAME "fs_file_read" +#define RELEASE_TP_NAME "fs_file_release" + +int main(int argc, char *argv[]) +{ + const char *name = "./file_read_pattern_prog.o"; + struct bpf_object *obj; + const char *prog_name; + struct bpf_program *prog; + int unused; + int err; + int read_fd; + int release_fd; + + err = bpf_prog_load(name, BPF_PROG_TYPE_UNSPEC, &obj, &unused); + if (err) { + printf("Failed to load program\n"); + return err; + } + + prog_name = "raw_tracepoint.w/" READ_TP_NAME; + prog = bpf_object__find_program_by_title(obj, prog_name); + if (!prog) { + printf("no prog %s\n", prog_name); + err = -EINVAL; + goto out; + } + + read_fd = bpf_raw_tracepoint_open(READ_TP_NAME, bpf_program__fd(prog)); + if (read_fd < 0) { + err = -errno; + printf("Failed to attach raw tracepoint %s\n", READ_TP_NAME); + goto out; + } + + prog_name = "raw_tracepoint/" RELEASE_TP_NAME; + prog = bpf_object__find_program_by_title(obj, prog_name); + if (!prog) { + printf("no prog %s\n", prog_name); + err = -EINVAL; + goto out; + } + + release_fd = bpf_raw_tracepoint_open(RELEASE_TP_NAME, + bpf_program__fd(prog)); + if (release_fd < 0) { + err = -errno; + printf("Failed to attach raw tracepoint %s\n", RELEASE_TP_NAME); + goto out; + } + + pause(); + + close(release_fd); + close(read_fd); +out: + bpf_object__close(obj); + return err; +} diff --git 
a/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c new file mode 100644 index 000000000000..834a68add142 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021. Huawei Technologies Co., Ltd */ +#include <stdbool.h> +#include <string.h> +#include <linux/bpf.h> + +#include <bpf/bpf_helpers.h> + +/* Need to keep consistent with definitions in include/linux/fs.h */ +#define FMODE_CTL_RANDOM 0x1 +#define FMODE_CTL_WILLNEED 0x2 + +struct fs_file_read_ctx { + const unsigned char *name; + unsigned int f_ctl_mode; + unsigned int rsvd; + /* clear from f_ctl_mode */ + unsigned int clr_f_ctl_mode; + /* set into f_ctl_mode */ + unsigned int set_f_ctl_mode; + unsigned long key; + /* file size */ + long long i_size; + /* previous page index */ + long long prev_index; + /* current page index */ + long long index; +}; + +struct fs_file_read_args { + struct fs_file_read_ctx *ctx; + int version; +}; + +struct fs_file_release_args { + void *inode; + void *filp; +}; + +struct file_rd_hist { + __u64 last_nsec; + __u32 seq_nr; + __u32 tot_nr; +}; + +struct bpf_map_def SEC("maps") htab = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(long), + .value_size = sizeof(struct file_rd_hist), + .max_entries = 10000, +}; + +static bool is_expected_file(void *name) +{ + char prefix[5]; + int err; + + err = bpf_probe_read_str(&prefix, sizeof(prefix), name); + if (err <= 0) + return false; + return !strncmp(prefix, "blk_", 4); +} + +SEC("raw_tracepoint.w/fs_file_read") +int fs_file_read(struct fs_file_read_args *args) +{ + const char fmt[] = "elapsed %llu, seq %u, tot %u\n"; + struct fs_file_read_ctx *rd_ctx = args->ctx; + struct file_rd_hist *hist; + struct file_rd_hist new_hist; + __u64 key; + __u64 now; + bool first; + + if (!is_expected_file((void *)rd_ctx->name)) + return 0; + + if (rd_ctx->i_size <= (4 << 20)) { + 
rd_ctx->set_f_ctl_mode = FMODE_CTL_WILLNEED; + return 0; + } + + first = false; + now = bpf_ktime_get_ns(); + key = rd_ctx->key; + hist = bpf_map_lookup_elem(&htab, &key); + if (!hist) { + __builtin_memset(&new_hist, 0, sizeof(new_hist)); + new_hist.last_nsec = now; + first = true; + hist = &new_hist; + } + + if (rd_ctx->index >= rd_ctx->prev_index && + rd_ctx->index - rd_ctx->prev_index <= 1) + hist->seq_nr += 1; + hist->tot_nr += 1; + + bpf_trace_printk(fmt, sizeof(fmt), now - hist->last_nsec, + hist->seq_nr, hist->tot_nr); + + if (first) { + bpf_map_update_elem(&htab, &key, hist, 0); + return 0; + } + + /* 500ms or 10 read */ + if (now - hist->last_nsec >= 500000000ULL || hist->tot_nr >= 10) { + if (hist->tot_nr >= 10) { + if (hist->seq_nr <= hist->tot_nr * 3 / 10) + rd_ctx->set_f_ctl_mode = FMODE_CTL_RANDOM; + else if (hist->seq_nr >= hist->tot_nr * 7 / 10) + rd_ctx->clr_f_ctl_mode = FMODE_CTL_RANDOM; + } + + hist->last_nsec = now; + hist->tot_nr = 0; + hist->seq_nr = 0; + } + + return 0; +} + +SEC("raw_tracepoint/fs_file_release") +int fs_file_release(struct fs_file_release_args *args) +{ + __u64 key = (unsigned long)args->filp; + void *value; + + value = bpf_map_lookup_elem(&htab, &key); + if (value) + bpf_map_delete_elem(&htab, &key); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1;
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7Y9JD CVE: NA
-----------------------------------------
The readahead feature of the openEuler-22.03-LTS eBPF enhancement involves interface changes and needs to be compatible with the eBPF tool of openEuler-1.0-LTS. This patch renames the *_ctl_mode members of the fs_file_read_ctx structure to *_mode.
Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com Reviewed-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/read_write.c | 8 ++++---- include/linux/fs.h | 10 +++++----- .../selftests/bpf/progs/file_read_pattern_prog.c | 16 ++++++++-------- 3 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c index 82dc57c3bfa1..c050dffe6a4e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1727,7 +1727,7 @@ static void fs_file_read_ctx_init(struct fs_file_read_ctx *ctx, { memset(ctx, 0, sizeof(*ctx)); ctx->name = file_dentry(filp)->d_name.name; - ctx->f_ctl_mode = filp->f_ctl_mode; + ctx->f_mode = filp->f_mode; ctx->key = (unsigned long)filp; ctx->i_size = file_inode(filp)->i_size; ctx->prev_index = filp->f_ra.prev_pos >> PAGE_SHIFT; @@ -1745,11 +1745,11 @@ void fs_file_read_update_args_by_trace(struct kiocb *iocb) fs_file_read_ctx_init(&ctx, filp, iocb->ki_pos); trace_fs_file_read(&ctx, FS_FILE_READ_VERSION);
- if (!ctx.set_f_ctl_mode && !ctx.clr_f_ctl_mode) + if (!ctx.set_f_mode && !ctx.clr_f_mode) return;
- filp->f_ctl_mode |= ctx.set_f_ctl_mode & FS_FILE_READ_MODE_MASK; - filp->f_ctl_mode &= ~(ctx.clr_f_ctl_mode & FS_FILE_READ_MODE_MASK); + filp->f_ctl_mode |= ctx.set_f_mode & FS_FILE_READ_MODE_MASK; + filp->f_ctl_mode &= ~(ctx.clr_f_mode & FS_FILE_READ_MODE_MASK); } EXPORT_SYMBOL_GPL(fs_file_read_update_args_by_trace); #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index bef7f5254bd9..21b5c6a59083 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -191,10 +191,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000)
/* File mode control flag, expect random access pattern */ -#define FMODE_CTL_RANDOM ((__force fmode_t)0x1) +#define FMODE_CTL_RANDOM ((__force fmode_t)0x1000)
/* File mode control flag, will try to read head of the file into pagecache */ -#define FMODE_CTL_WILLNEED ((__force fmode_t)0x2) +#define FMODE_CTL_WILLNEED ((__force fmode_t)0x400000)
/* * Attribute flags. These should be or-ed together to figure out what @@ -3215,12 +3215,12 @@ extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
struct fs_file_read_ctx { const unsigned char *name; - unsigned int f_ctl_mode; + unsigned int f_mode; unsigned int rsvd; /* clear from f_ctl_mode */ - unsigned int clr_f_ctl_mode; + unsigned int clr_f_mode; /* set into f_ctl_mode */ - unsigned int set_f_ctl_mode; + unsigned int set_f_mode; unsigned long key; /* file size */ long long i_size; diff --git a/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c index 834a68add142..17c47ed63531 100644 --- a/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c +++ b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c @@ -7,17 +7,17 @@ #include <bpf/bpf_helpers.h>
/* Need to keep consistent with definitions in include/linux/fs.h */ -#define FMODE_CTL_RANDOM 0x1 -#define FMODE_CTL_WILLNEED 0x2 +#define FMODE_CTL_RANDOM 0x1000 +#define FMODE_CTL_WILLNEED 0x400000
struct fs_file_read_ctx { const unsigned char *name; - unsigned int f_ctl_mode; + unsigned int f_mode; unsigned int rsvd; /* clear from f_ctl_mode */ - unsigned int clr_f_ctl_mode; + unsigned int clr_f_mode; /* set into f_ctl_mode */ - unsigned int set_f_ctl_mode; + unsigned int set_f_mode; unsigned long key; /* file size */ long long i_size; @@ -76,7 +76,7 @@ int fs_file_read(struct fs_file_read_args *args) return 0;
if (rd_ctx->i_size <= (4 << 20)) { - rd_ctx->set_f_ctl_mode = FMODE_CTL_WILLNEED; + rd_ctx->set_f_mode = FMODE_CTL_WILLNEED; return 0; }
@@ -108,9 +108,9 @@ int fs_file_read(struct fs_file_read_args *args) if (now - hist->last_nsec >= 500000000ULL || hist->tot_nr >= 10) { if (hist->tot_nr >= 10) { if (hist->seq_nr <= hist->tot_nr * 3 / 10) - rd_ctx->set_f_ctl_mode = FMODE_CTL_RANDOM; + rd_ctx->set_f_mode = FMODE_CTL_RANDOM; else if (hist->seq_nr >= hist->tot_nr * 7 / 10) - rd_ctx->clr_f_ctl_mode = FMODE_CTL_RANDOM; + rd_ctx->clr_f_mode = FMODE_CTL_RANDOM; }
hist->last_nsec = now;
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7Y9JD CVE: NA
-----------------------------------------
Some APIs were removed and others were changed in libbpf 1.0 and later. The purpose of this patch is to enable the demo to run successfully on libbpf 1.0+.
For details about API changes, see the following link: https://github.com/libbpf/libbpf/wiki/Libbpf:-the-road-to-v1.0 https://github.com/libbpf/libbpf/wiki/Libbpf-1.0-migration-guide
Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com --- .../testing/selftests/bpf/file_read_pattern.c | 30 +++++++++---------- .../bpf/progs/file_read_pattern_prog.c | 12 ++++---- 2 files changed, 21 insertions(+), 21 deletions(-)
diff --git a/tools/testing/selftests/bpf/file_read_pattern.c b/tools/testing/selftests/bpf/file_read_pattern.c index 81e3a49f0424..54d1f869bf04 100644 --- a/tools/testing/selftests/bpf/file_read_pattern.c +++ b/tools/testing/selftests/bpf/file_read_pattern.c @@ -10,32 +10,33 @@ #include <bpf/bpf.h> #include <bpf/libbpf.h>
-#include "bpf_rlimit.h" - #define READ_TP_NAME "fs_file_read" #define RELEASE_TP_NAME "fs_file_release"
int main(int argc, char *argv[]) { - const char *name = "./file_read_pattern_prog.o"; + const char *name = "./file_read_pattern_prog.bpf.o"; struct bpf_object *obj; - const char *prog_name; struct bpf_program *prog; - int unused; - int err; + int err = 0; int read_fd; int release_fd;
- err = bpf_prog_load(name, BPF_PROG_TYPE_UNSPEC, &obj, &unused); - if (err) { - printf("Failed to load program\n"); + obj = bpf_object__open_file(name, NULL); + if (!obj) { + printf("Failed to open program: %s\n", name); return err; }
- prog_name = "raw_tracepoint.w/" READ_TP_NAME; - prog = bpf_object__find_program_by_title(obj, prog_name); + err = bpf_object__load(obj); + if (err) { + printf("failed to load program: %s\n", name); + goto out; + } + + prog = bpf_object__find_program_by_name(obj, READ_TP_NAME); if (!prog) { - printf("no prog %s\n", prog_name); + printf("no prog %s\n", READ_TP_NAME); err = -EINVAL; goto out; } @@ -47,10 +48,9 @@ int main(int argc, char *argv[]) goto out; }
- prog_name = "raw_tracepoint/" RELEASE_TP_NAME; - prog = bpf_object__find_program_by_title(obj, prog_name); + prog = bpf_object__find_program_by_name(obj, RELEASE_TP_NAME); if (!prog) { - printf("no prog %s\n", prog_name); + printf("no prog %s\n", RELEASE_TP_NAME); err = -EINVAL; goto out; } diff --git a/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c index 17c47ed63531..3772275eefcc 100644 --- a/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c +++ b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c @@ -43,12 +43,12 @@ struct file_rd_hist { __u32 tot_nr; };
-struct bpf_map_def SEC("maps") htab = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(struct file_rd_hist), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, struct file_rd_hist); + __uint(max_entries, 10000); +} htab SEC(".maps");
static bool is_expected_file(void *name) {
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/2001 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/U...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/2001 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/U...