From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
There are two issues with the current solution: 1) The tracepoint xfs_file_read is visible in tracefs, so it forms an ABI for userspace. That is bad because new fields may need to be added to xfs_writable_file to export more information to userspace.
2) The tracepoint xfs_file_read is specific to xfs, but HDFS can also be stacked on ext4.
A new solution that uses a VFS bare tracepoint is proposed, so revert commit 69513cfbe62d267c4a5e6025f31741b1f2cb946c.
Signed-off-by: Hou Tao houtao1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/include/uapi/linux/xfs.h | 1 - tools/testing/selftests/bpf/Makefile | 3 +- .../bpf/test_spec_readahead_xfs_file.c | 39 ------------------- tools/testing/selftests/bpf/test_xfs_file.c | 9 +---- 4 files changed, 3 insertions(+), 49 deletions(-) delete mode 100644 tools/testing/selftests/bpf/test_spec_readahead_xfs_file.c
diff --git a/tools/include/uapi/linux/xfs.h b/tools/include/uapi/linux/xfs.h index 1409b45affd34..a0d37e411ee18 100644 --- a/tools/include/uapi/linux/xfs.h +++ b/tools/include/uapi/linux/xfs.h @@ -6,7 +6,6 @@
#define FMODE_RANDOM (0x1000) #define FMODE_WILLNEED (0x400000) -#define FMODE_SPC_READAHEAD (0x800000)
struct xfs_writable_file { const unsigned char *name; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 46b1d5b864f5a..8d2737285f185 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -36,8 +36,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o test_set_xfs_file.o test_clear_xfs_file.o \ - test_spec_readahead_xfs_file.o + test_skb_cgroup_id_kern.o test_set_xfs_file.o test_clear_xfs_file.o
# Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/test_spec_readahead_xfs_file.c b/tools/testing/selftests/bpf/test_spec_readahead_xfs_file.c deleted file mode 100644 index ff8794a14cdcd..0000000000000 --- a/tools/testing/selftests/bpf/test_spec_readahead_xfs_file.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/bpf.h> -#include "bpf_helpers.h" -#include <string.h> -#include <linux/xfs.h> - -/* from /sys/kernel/debug/tracing/events/xfs/xfs_read_file */ -struct xfs_read_buffer_args { - struct xfs_writable_file *file; -}; - -SEC("tracepoint/xfs/xfs_file_read") -int bpf_prog1(struct xfs_read_buffer_args *ctx) -{ - char fmt[] = "name: %s, set f_mode: %u\n"; - struct xfs_writable_file *file = ctx->file; - char name[64] = {}; - char *tmp; - unsigned long i_size; - int len; - - bpf_probe_read(&tmp, 8, &(file->name)); - len = bpf_probe_read_str(name, 64, tmp); - bpf_probe_read(&i_size, 8, &(file->i_size)); - - if (!strncmp("blk_", name, 4)) { - /* blk_xxx.meta or blk_xxx with size < 2M */ - if (len == 27 || (len == 15 && i_size <= 2 * 1024 * 1024)) - file->f_mode |= FMODE_WILLNEED; - else if (len == 15) /* blk_xxx */ - file->f_mode |= FMODE_SPC_READAHEAD; - bpf_trace_printk(fmt, sizeof(fmt), name, file->f_mode); - } - return 0; -} - -char _license[] SEC("license") = "GPL"; -__u32 _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/test_xfs_file.c b/tools/testing/selftests/bpf/test_xfs_file.c index 89e79d959677c..247c42be029b2 100644 --- a/tools/testing/selftests/bpf/test_xfs_file.c +++ b/tools/testing/selftests/bpf/test_xfs_file.c @@ -20,7 +20,6 @@ int main(int argc, char *argv[]) { const char *set_file = "./test_set_xfs_file.o"; const char *clear_file = "./test_clear_xfs_file.o"; - const char *spec_readahead_file = "./test_spec_readahead_xfs_file.o"; const char *file = set_file; struct bpf_object *obj; int efd, err, prog_fd; @@ -32,12 +31,8 @@ int 
main(int argc, char *argv[]) delay = strtol(str, &endptr, 10); }
- if (argc >= 2) { - if (!strcmp("clear", argv[1])) - file = clear_file; - if (!strcmp("spec_readahead", argv[1])) - file = spec_readahead_file; - } + if (argc >= 2 && !strcmp("clear", argv[1])) + file = clear_file;
err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, &obj, &prog_fd);
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
There are two issues with the current solution: 1) The tracepoint xfs_file_read is visible in tracefs, so it forms an ABI for userspace. That is bad because new fields may need to be added to xfs_writable_file to export more information to userspace.
2) The tracepoint xfs_file_read is specific to xfs, but HDFS can also be stacked on ext4.
This reverts commit 38abc1bb82c913b6548067195beb5894d994bd5d.
Signed-off-by: Hou Tao houtao1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/include/uapi/linux/xfs.h | 4 +- tools/testing/selftests/bpf/Makefile | 2 +- .../selftests/bpf/test_clear_xfs_file.c | 43 ------------------- tools/testing/selftests/bpf/test_xfs_file.c | 11 ++--- 4 files changed, 5 insertions(+), 55 deletions(-) delete mode 100644 tools/testing/selftests/bpf/test_clear_xfs_file.c
diff --git a/tools/include/uapi/linux/xfs.h b/tools/include/uapi/linux/xfs.h index a0d37e411ee18..2c4c61d5ba539 100644 --- a/tools/include/uapi/linux/xfs.h +++ b/tools/include/uapi/linux/xfs.h @@ -9,11 +9,9 @@
struct xfs_writable_file { const unsigned char *name; - unsigned int clear_f_mode; /* can be cleared from file->f_mode */ unsigned int f_mode; /* can be set into file->f_mode */ long long i_size; /* file size */ - long long prev_pos; /* ra->prev_pos page index */ - long long pos; /* iocb->ki_pos page index */ + long long prev_pos; /* ra->prev_pos */ };
#endif /* _UAPI_LINUX_XFS_H */ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8d2737285f185..1ba656a8ed656 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -36,7 +36,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o test_set_xfs_file.o test_clear_xfs_file.o + test_skb_cgroup_id_kern.o test_set_xfs_file.o
# Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/test_clear_xfs_file.c b/tools/testing/selftests/bpf/test_clear_xfs_file.c deleted file mode 100644 index 167ee63207232..0000000000000 --- a/tools/testing/selftests/bpf/test_clear_xfs_file.c +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/bpf.h> -#include "bpf_helpers.h" -#include <string.h> -#include <linux/xfs.h> - -/* from /sys/kernel/debug/tracing/events/xfs/xfs_read_file */ -struct xfs_read_buffer_args { - struct xfs_writable_file *file; -}; - -SEC("tracepoint/xfs/xfs_file_read") -int bpf_prog1(struct xfs_read_buffer_args *ctx) -{ - char fmt[] = "name: %s, clear_f_mode: %u, f_mode: %u\n"; - struct xfs_writable_file *file = ctx->file; - char name[64] = {}; - char *tmp; - unsigned long i_size; - int len; - - bpf_probe_read(&tmp, 8, &(file->name)); - len = bpf_probe_read_str(name, 64, tmp); - bpf_probe_read(&i_size, 8, &(file->i_size)); - - if (!strncmp("blk_", name, 4)) { - /* blk_xxx.meta or blk_xxx with size < 2M */ - if (len == 27 || (len == 15 && i_size <= 2 * 1024 * 1024)) { - file->f_mode |= FMODE_WILLNEED; - /* blk_xxx */ - } else if (len == 15) { - if (file->prev_pos == file->pos) - file->clear_f_mode |= FMODE_RANDOM; - } - bpf_trace_printk(fmt, sizeof(fmt), name, file->clear_f_mode, - file->f_mode); - } - return 0; -} - -char _license[] SEC("license") = "GPL"; -__u32 _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/test_xfs_file.c b/tools/testing/selftests/bpf/test_xfs_file.c index 247c42be029b2..d0bc971d93bf9 100644 --- a/tools/testing/selftests/bpf/test_xfs_file.c +++ b/tools/testing/selftests/bpf/test_xfs_file.c @@ -18,22 +18,17 @@
int main(int argc, char *argv[]) { - const char *set_file = "./test_set_xfs_file.o"; - const char *clear_file = "./test_clear_xfs_file.o"; - const char *file = set_file; + const char *file = "./test_set_xfs_file.o"; struct bpf_object *obj; int efd, err, prog_fd; int delay = SLEEP_SECS; char *endptr, *str;
- if (argc == 3) { - str = argv[2]; + if (argc == 2) { + str = argv[1]; delay = strtol(str, &endptr, 10); }
- if (argc >= 2 && !strcmp("clear", argv[1])) - file = clear_file; - err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, &obj, &prog_fd); if (err) {
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
There are two issues with the current solution: 1) The tracepoint xfs_file_read is visible in tracefs, so it forms an ABI for userspace. That is bad because new fields may need to be added to xfs_writable_file to export more information to userspace.
2) The tracepoint xfs_file_read is specific to xfs, but HDFS can also be stacked on ext4.
This reverts commit b1e9dddb580ac9d589d8dca9787235583f2baa21, but keeps the definitions of FMODE_WILLNEED & FMODE_SPC_READAHEAD.
Signed-off-by: Hou Tao houtao1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/xfs/xfs_file.c | 10 +++------- include/uapi/linux/xfs.h | 4 +--- tools/include/uapi/linux/xfs.h | 2 +- 3 files changed, 5 insertions(+), 11 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index bd8ae4df20042..ffc388c8b4523 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -35,8 +35,6 @@ #include <linux/mman.h> #include <linux/fadvise.h>
-#define FMODE_MASK (FMODE_RANDOM | FMODE_WILLNEED | FMODE_SPC_READAHEAD) - static const struct vm_operations_struct xfs_file_vm_ops;
int @@ -240,17 +238,15 @@ xfs_file_buffered_aio_read( struct xfs_writable_file file;
file.name = file_dentry(filp)->d_name.name; - file.clear_f_mode = 0; file.f_mode = 0; file.i_size = file_inode(filp)->i_size; - file.prev_pos = filp->f_ra.prev_pos >> PAGE_SHIFT; - file.pos = iocb->ki_pos >> PAGE_SHIFT; + file.prev_pos = filp->f_ra.prev_pos;
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); trace_xfs_file_read(&file, ip, iov_iter_count(to), iocb->ki_pos);
- filp->f_mode |= file.f_mode & FMODE_MASK; - filp->f_mode &= ~(file.clear_f_mode & FMODE_MASK); + if (file.f_mode) + filp->f_mode |= file.f_mode;
if (iocb->ki_flags & IOCB_NOWAIT) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) diff --git a/include/uapi/linux/xfs.h b/include/uapi/linux/xfs.h index 0a11c2344e5a3..635a83914273b 100644 --- a/include/uapi/linux/xfs.h +++ b/include/uapi/linux/xfs.h @@ -7,11 +7,9 @@
struct xfs_writable_file { const unsigned char *name; - unsigned int clear_f_mode; /* can be cleared from file->f_mode */ unsigned int f_mode; /* can be set into file->f_mode */ long long i_size; /* file size */ - long long prev_pos; /* ra->prev_pos page index */ - long long pos; /* iocb->ki_pos page index */ + long long prev_pos; /* ra->prev_pos */ };
#endif /* _UAPI_LINUX_XFS_H */ diff --git a/tools/include/uapi/linux/xfs.h b/tools/include/uapi/linux/xfs.h index 2c4c61d5ba539..f333a2eb74074 100644 --- a/tools/include/uapi/linux/xfs.h +++ b/tools/include/uapi/linux/xfs.h @@ -5,7 +5,7 @@ #include <linux/types.h>
#define FMODE_RANDOM (0x1000) -#define FMODE_WILLNEED (0x400000) +#define FMODE_WILLNEED (0x40000000)
struct xfs_writable_file { const unsigned char *name;
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
There are two issues with the current solution: 1) The tracepoint xfs_file_read is visible in tracefs, so it forms an ABI for userspace. That is bad because new fields may need to be added to xfs_writable_file to export more information to userspace.
2) The tracepoint xfs_file_read is specific to xfs, but HDFS can also be stacked on ext4.
This reverts commit 66844901d6f057eac02868e943f5a70e743785f2.
Signed-off-by: Hou Tao houtao1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/include/uapi/linux/xfs.h | 17 ------- tools/testing/selftests/bpf/Makefile | 5 +- .../testing/selftests/bpf/test_set_xfs_file.c | 40 --------------- tools/testing/selftests/bpf/test_xfs_file.c | 51 ------------------- 4 files changed, 2 insertions(+), 111 deletions(-) delete mode 100644 tools/include/uapi/linux/xfs.h delete mode 100644 tools/testing/selftests/bpf/test_set_xfs_file.c delete mode 100644 tools/testing/selftests/bpf/test_xfs_file.c
diff --git a/tools/include/uapi/linux/xfs.h b/tools/include/uapi/linux/xfs.h deleted file mode 100644 index f333a2eb74074..0000000000000 --- a/tools/include/uapi/linux/xfs.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_XFS_H -#define _UAPI_LINUX_XFS_H - -#include <linux/types.h> - -#define FMODE_RANDOM (0x1000) -#define FMODE_WILLNEED (0x40000000) - -struct xfs_writable_file { - const unsigned char *name; - unsigned int f_mode; /* can be set into file->f_mode */ - long long i_size; /* file size */ - long long prev_pos; /* ra->prev_pos */ -}; - -#endif /* _UAPI_LINUX_XFS_H */ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1ba656a8ed656..f3f874ba186bb 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -23,8 +23,7 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ - test_socket_cookie test_cgroup_storage test_select_reuseport test_xfs_file - + test_socket_cookie test_cgroup_storage test_select_reuseport
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ @@ -36,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o test_set_xfs_file.o + test_skb_cgroup_id_kern.o
# Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/test_set_xfs_file.c b/tools/testing/selftests/bpf/test_set_xfs_file.c deleted file mode 100644 index 0b289bbc3985d..0000000000000 --- a/tools/testing/selftests/bpf/test_set_xfs_file.c +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/bpf.h> -#include "bpf_helpers.h" -#include <string.h> -#include <linux/xfs.h> - -/* from /sys/kernel/debug/tracing/events/xfs/xfs_read_file */ -struct xfs_read_buffer_args { - struct xfs_writable_file *file; -}; - -SEC("tracepoint/xfs/xfs_file_read") -int bpf_prog1(struct xfs_read_buffer_args *ctx) -{ - char fmt[] = "name: %s, f_mode:%d, i_size:%lu\n"; - struct xfs_writable_file *file = ctx->file; - char name[64] = {}; - char *tmp; - unsigned long i_size; - int len; - - bpf_probe_read(&tmp, 8, &(file->name)); - len = bpf_probe_read_str(name, 64, tmp); - bpf_probe_read(&i_size, 8, &(file->i_size)); - - if (!strncmp("blk_", name, 4)) { - /* blk_xxx.meta or blk_xxx with size < 2M */ - if (len == 27 || (len == 15 && i_size <= 2 * 1024 * 1024)) - file->f_mode |= FMODE_WILLNEED; - /* blk_xxx */ - else if (len == 15) - file->f_mode |= FMODE_RANDOM; - bpf_trace_printk(fmt, sizeof(fmt), name, file->f_mode, i_size); - } - return 0; -} - -char _license[] SEC("license") = "GPL"; -__u32 _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/test_xfs_file.c b/tools/testing/selftests/bpf/test_xfs_file.c deleted file mode 100644 index d0bc971d93bf9..0000000000000 --- a/tools/testing/selftests/bpf/test_xfs_file.c +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> -#include <assert.h> -#include <sys/time.h> -#include <unistd.h> - -#include <linux/bpf.h> -#include <bpf/bpf.h> -#include <bpf/libbpf.h> - -#include "bpf_rlimit.h" - -#define SLEEP_SECS 9999999 - -int main(int argc, char *argv[]) 
-{ - const char *file = "./test_set_xfs_file.o"; - struct bpf_object *obj; - int efd, err, prog_fd; - int delay = SLEEP_SECS; - char *endptr, *str; - - if (argc == 2) { - str = argv[1]; - delay = strtol(str, &endptr, 10); - } - - err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, &obj, - &prog_fd); - if (err) { - printf("Failed to load xfs program\n"); - goto out; - } - - efd = bpf_raw_tracepoint_open("xfs_file_read", prog_fd); - if (efd < 0) { - printf("Fail to open tracepoint, efd %d\n", efd); - goto out; - } - - sleep(delay); - - printf("END\n"); - -out: - return err; -}
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
There are two issues with the current solution: 1) The tracepoint xfs_file_read is visible in tracefs, so it forms an ABI for userspace. That is bad because new fields may need to be added to xfs_writable_file to export more information to userspace.
2) The tracepoint xfs_file_read is specific to xfs, but HDFS can also be stacked on ext4.
This reverts commit 4a0a1e84ab358dc3c151ea8f639ec719be59a782.
Signed-off-by: Hou Tao houtao1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/xfs/xfs_file.c | 13 +----------- fs/xfs/xfs_linux.h | 1 - fs/xfs/xfs_trace.h | 46 ---------------------------------------- include/uapi/linux/xfs.h | 15 ------------- 4 files changed, 1 insertion(+), 74 deletions(-) delete mode 100644 include/uapi/linux/xfs.h
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ffc388c8b4523..1b2eb9d055ba0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -232,21 +232,10 @@ xfs_file_buffered_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct file *filp = iocb->ki_filp; - struct xfs_inode *ip = XFS_I(file_inode(filp)); + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); ssize_t ret; - struct xfs_writable_file file; - - file.name = file_dentry(filp)->d_name.name; - file.f_mode = 0; - file.i_size = file_inode(filp)->i_size; - file.prev_pos = filp->f_ra.prev_pos;
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); - trace_xfs_file_read(&file, ip, iov_iter_count(to), iocb->ki_pos); - - if (file.f_mode) - filp->f_mode |= file.f_mode;
if (iocb->ki_flags & IOCB_NOWAIT) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 086173507c271..edbd5a210df22 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -60,7 +60,6 @@ typedef __u32 xfs_nlink_t; #include <linux/list_sort.h> #include <linux/ratelimit.h> #include <linux/rhashtable.h> -#include <uapi/linux/xfs.h>
#include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 678a02d145445..a3e7813778b06 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3371,52 +3371,6 @@ TRACE_EVENT(xfs_iunlink_update_bucket, __entry->new_ptr) );
-DECLARE_EVENT_CLASS(xfs_file_read, - TP_PROTO(struct xfs_writable_file *file, struct xfs_inode *ip, - size_t count, loff_t offset), - TP_ARGS(file, ip, count, offset), - TP_STRUCT__entry( - __field(struct xfs_writable_file *, file) - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(loff_t, offset) - __field(size_t, count) - ), - TP_fast_assign( - __entry->file = 0; - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->offset = offset; - __entry->count = count; - ), - TP_printk("dev %d:%d ino 0x%llx offset 0x%llx count 0x%zx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->offset, - __entry->count) -); - -#ifdef DEFINE_EVENT_WRITABLE -#undef XFS_DEFINE_EVENT -#define XFS_DEFINE_EVENT(template, call, proto, args, size) \ - DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \ - PARAMS(args), size) -#else -#undef XFS_DEFINE_EVENT -#define XFS_DEFINE_EVENT(template, call, proto, args, size) \ - DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args)) -#endif - -XFS_DEFINE_EVENT(xfs_file_read, xfs_file_read, - - TP_PROTO(struct xfs_writable_file *file, struct xfs_inode *ip, - size_t count, loff_t offset), - - TP_ARGS(file, ip, count, offset), - - sizeof(struct xfs_writable_file) -); - #endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH diff --git a/include/uapi/linux/xfs.h b/include/uapi/linux/xfs.h deleted file mode 100644 index 635a83914273b..0000000000000 --- a/include/uapi/linux/xfs.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ - -#ifndef _UAPI_LINUX_XFS_H -#define _UAPI_LINUX_XFS_H - -#include <linux/types.h> - -struct xfs_writable_file { - const unsigned char *name; - unsigned int f_mode; /* can be set into file->f_mode */ - long long i_size; /* file size */ - long long prev_pos; /* ra->prev_pos */ -}; - -#endif /* _UAPI_LINUX_XFS_H */
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
mainline inclusion from mainline-5.10 commit afbe7973173a7ce0a68af8b33e44c967582297be category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
As tracepoints are discouraged from being added in a header because it can cause side effects if other tracepoints are in headers, as well as bloat the kernel as the trace_<tracepoint>() function is not a small inline, the common workaround is to add a function call that calls a wrapper function in a C file that then calls the tracepoint. But as function calls add overhead, this function should only be called when the tracepoint in question is enabled. To get around this overhead, a static_branch can be used to only have the tracepoint wrapper get called when the tracepoint is enabled.
Add a tracepoint_enabled(tp) macro that gets passed the name of the tracepoint, and this becomes a static_branch that is enabled when the tracepoint is enabled and is a nop when the tracepoint is disabled.
Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Hou Tao houtao1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/trace/tracepoints.rst | 27 +++++++++++++++++++++++ include/linux/tracepoint-defs.h | 34 +++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+)
diff --git a/Documentation/trace/tracepoints.rst b/Documentation/trace/tracepoints.rst index 6e3ce3bf3593a..0cb8d9ca3d608 100644 --- a/Documentation/trace/tracepoints.rst +++ b/Documentation/trace/tracepoints.rst @@ -146,3 +146,30 @@ with jump labels and avoid conditional branches. define tracepoints. Check http://lwn.net/Articles/379903, http://lwn.net/Articles/381064 and http://lwn.net/Articles/383362 for a series of articles with more details. + +If you require calling a tracepoint from a header file, it is not +recommended to call one directly or to use the trace_<tracepoint>_enabled() +function call, as tracepoints in header files can have side effects if a +header is included from a file that has CREATE_TRACE_POINTS set, as +well as the trace_<tracepoint>() is not that small of an inline +and can bloat the kernel if used by other inlined functions. Instead, +include tracepoint-defs.h and use tracepoint_enabled(). + +In a C file:: + + void do_trace_foo_bar_wrapper(args) + { + trace_foo_bar(args); + } + +In the header file:: + + DECLARE_TRACEPOINT(foo_bar); + + static inline void some_inline_function() + { + [..] + if (tracepoint_enabled(foo_bar)) + do_trace_foo_bar_wrapper(args); + [..] + } diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index fb17ab8934575..75f3fefa42d72 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -50,4 +50,38 @@ struct bpf_raw_event_map { #endif } __aligned(32);
+/* + * If a tracepoint needs to be called from a header file, it is not + * recommended to call it directly, as tracepoints in header files + * may cause side-effects and bloat the kernel. Instead, use + * tracepoint_enabled() to test if the tracepoint is enabled, then if + * it is, call a wrapper function defined in a C file that will then + * call the tracepoint. + * + * For "trace_foo_bar()", you would need to create a wrapper function + * in a C file to call trace_foo_bar(): + * void do_trace_foo_bar(args) { trace_foo_bar(args); } + * Then in the header file, declare the tracepoint: + * DECLARE_TRACEPOINT(foo_bar); + * And call your wrapper: + * static inline void some_inlined_function() { + * [..] + * if (tracepoint_enabled(foo_bar)) + * do_trace_foo_bar(args); + * [..] + * } + * + * Note: tracepoint_enabled(foo_bar) is equivalent to trace_foo_bar_enabled() + * but is safe to have in headers, where trace_foo_bar_enabled() is not. + */ +#define DECLARE_TRACEPOINT(tp) \ + extern struct tracepoint __tracepoint_##tp + +#ifdef CONFIG_TRACEPOINTS +# define tracepoint_enabled(tp) \ + static_key_false(&(__tracepoint_##tp).key) +#else +# define tracepoint_enabled(tracepoint) false +#endif + #endif
From: Qais Yousef qais.yousef@arm.com
mainline inclusion from mainline-5.12 commit 6939f4ef16d48f2093f337162cfc041d0e30ed25 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
Some subsystems only have bare tracepoints (a tracepoint with no associated trace event) to avoid the problem of trace events being an ABI that can't be changed.
From bpf's perspective, bare tracepoints are what it calls
RAW_TRACEPOINT().
Since bpf assumed there's a 1:1 mapping, it relied on hooking into the DEFINE_EVENT() macro to create the bpf mapping of the tracepoints. Since bare tracepoints use DECLARE_TRACE() to create the tracepoint, bpf had no knowledge of their existence.
By teaching bpf_probe.h to parse DECLARE_TRACE() in a similar fashion to DEFINE_EVENT(), bpf can find and attach to the new raw tracepoints.
Enabling that comes with the contract that changes to raw tracepoints don't constitute a regression if they break existing bpf programs. We need the ability to continue to morph and modify these raw tracepoints without worrying about any ABI.
Update Documentation/bpf/bpf_design_QA.rst to document this contract.
Signed-off-by: Qais Yousef qais.yousef@arm.com Signed-off-by: Alexei Starovoitov ast@kernel.org Acked-by: Yonghong Song yhs@fb.com Link: https://lore.kernel.org/bpf/20210119122237.2426878-2-qais.yousef@arm.com Signed-off-by: Hou Tao houtao1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/bpf/bpf_design_QA.rst | 6 ++++++ include/trace/bpf_probe.h | 12 ++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst index 6780a6d817458..e2cc405acddb5 100644 --- a/Documentation/bpf/bpf_design_QA.rst +++ b/Documentation/bpf/bpf_design_QA.rst @@ -164,6 +164,12 @@ kernels. The union bpf_attr -> kern_version is checked at load time to prevent accidentally loading kprobe-based bpf programs written for a different kernel. Networking programs don't do kern_version check.
+Q: Are tracepoints part of the stable ABI? +------------------------------------------ +A: NO. Tracepoints are tied to internal implementation details hence they are +subject to change and can break with newer kernels. BPF programs need to change +accordingly when this happens. + Q: How much stack space a BPF program uses? ------------------------------------------- A: Currently all program types are limited to 512 bytes of stack diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index d6e556c0a0852..1144417050e80 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -55,8 +55,7 @@ /* tracepoints with more than 12 arguments will hit build error */ #define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
-#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +#define __BPF_DECLARE_TRACE(call, proto, args) \ static notrace void \ __bpf_trace_##call(void *__data, proto) \ { \ @@ -64,6 +63,10 @@ __bpf_trace_##call(void *__data, proto) \ CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \ }
+#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ + __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) + /* * This part is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the @@ -106,6 +109,11 @@ __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+#undef DECLARE_TRACE +#define DECLARE_TRACE(call, proto, args) \ + __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \ + __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
#undef DEFINE_EVENT_WRITABLE
From: Hou Tao houtao1@huawei.com
mainline inclusion from mainline-5.16 commit 65223741ae1b759a14cab84ba88888bb025f816d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
Commit 9df1c28bb752 ("bpf: add writable context for raw tracepoints") added support for a writable context for tracepoints, but it lacks support for bare tracepoints, which have no associated trace event.
A bare tracepoint is defined by DECLARE_TRACE(), so add a corresponding DECLARE_TRACE_WRITABLE() macro to generate a definition in the __bpf_raw_tp_map section for a bare tracepoint, in a similar way to DEFINE_EVENT_WRITABLE().
Signed-off-by: Hou Tao houtao1@huawei.com Signed-off-by: Andrii Nakryiko andrii@kernel.org Acked-by: Andrii Nakryiko andrii@kernel.org Link: https://lore.kernel.org/bpf/20211004094857.30868-2-hotforest@gmail.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/trace/bpf_probe.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 1144417050e80..7720fac813892 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -88,8 +88,7 @@ __bpf_trace_tp_map_##call = { \
#define FIRST(x, ...) x
-#undef DEFINE_EVENT_WRITABLE -#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \ +#define __CHECK_WRITABLE_BUF_SIZE(call, proto, args, size) \ static inline void bpf_test_buffer_##call(void) \ { \ /* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \ @@ -98,8 +97,12 @@ static inline void bpf_test_buffer_##call(void) \ */ \ FIRST(proto); \ (void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args))); \ -} \ -__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) +} + +#undef DEFINE_EVENT_WRITABLE +#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \ + __CHECK_WRITABLE_BUF_SIZE(call, PARAMS(proto), PARAMS(args), size) \ + __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size)
#undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ @@ -114,9 +117,17 @@ __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \ __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0)
+#undef DECLARE_TRACE_WRITABLE +#define DECLARE_TRACE_WRITABLE(call, proto, args, size) \ + __CHECK_WRITABLE_BUF_SIZE(call, PARAMS(proto), PARAMS(args), size) \ + __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \ + __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), size) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+#undef DECLARE_TRACE_WRITABLE #undef DEFINE_EVENT_WRITABLE +#undef __CHECK_WRITABLE_BUF_SIZE #undef __DEFINE_EVENT #undef FIRST
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
Add a writable bare tracepoint fs_file_read() and a bare tracepoint fs_file_release().
A version argument is added to fs_file_read() so that fs_file_read_ctx can be extended in the future.
Both tracepoints are exported because they will be used by filesystem kernel modules.
Signed-off-by: Hou Tao houtao1@huawei.com Acked-by: fang wei fangwei1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/read_write.c | 5 +++++ include/linux/fs.h | 17 +++++++++++++++++ include/trace/events/fs.h | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) create mode 100644 include/trace/events/fs.h
diff --git a/fs/read_write.c b/fs/read_write.c index 87ee0764b8735..8ccf9a064f585 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -24,6 +24,8 @@
#include <linux/uaccess.h> #include <asm/unistd.h> +#define CREATE_TRACE_POINTS +#include <trace/events/fs.h>
const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, @@ -2164,3 +2166,6 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) return ret; } EXPORT_SYMBOL(vfs_dedupe_file_range); + +EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_release); diff --git a/include/linux/fs.h b/include/linux/fs.h index 44fd601552bf2..7ee16d2fe83b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3585,4 +3585,21 @@ static inline struct sock *io_uring_get_socket(struct file *file) } #endif
+struct fs_file_read_ctx { + const unsigned char *name; + unsigned int f_mode; + unsigned int rsvd; + /* clear from f_mode */ + unsigned int clr_f_mode; + /* set into f_mode */ + unsigned int set_f_mode; + unsigned long key; + /* file size */ + long long i_size; + /* previous page index */ + long long prev_index; + /* current page index */ + long long index; +}; + #endif /* _LINUX_FS_H */ diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h new file mode 100644 index 0000000000000..ee82dad9d9dad --- /dev/null +++ b/include/trace/events/fs.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs + +#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_H + +#include <linux/types.h> +#include <linux/tracepoint.h> +#include <linux/fs.h> + +#undef FS_DECLARE_TRACE +#ifdef DECLARE_TRACE_WRITABLE +#define FS_DECLARE_TRACE(call, proto, args, size) \ + DECLARE_TRACE_WRITABLE(call, PARAMS(proto), PARAMS(args), size) +#else +#define FS_DECLARE_TRACE(call, proto, args, size) \ + DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) +#endif + +FS_DECLARE_TRACE(fs_file_read, + TP_PROTO(struct fs_file_read_ctx *ctx, int version), + TP_ARGS(ctx, version), + sizeof(struct fs_file_read_ctx)); + +DECLARE_TRACE(fs_file_release, + TP_PROTO(struct inode *inode, struct file *filp), + TP_ARGS(inode, filp)); + +#endif /* _TRACE_FS_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h>
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
fs_file_read_do_trace() uses a writable tracepoint to update f_mode during the file read procedure. It is a static inline wrapper, so its out-of-line helper fs_file_read_update_args_by_trace() is exported to make it usable by filesystem kernel modules.
Signed-off-by: Hou Tao houtao1@huawei.com Acked-by: fang wei fangwei1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/read_write.c | 33 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 13 +++++++++++++ 2 files changed, 46 insertions(+)
diff --git a/fs/read_write.c b/fs/read_write.c index 8ccf9a064f585..c901520646155 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -2167,5 +2167,38 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } EXPORT_SYMBOL(vfs_dedupe_file_range);
+#ifdef CONFIG_TRACEPOINTS +static void fs_file_read_ctx_init(struct fs_file_read_ctx *ctx, + struct file *filp, loff_t pos) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->name = file_dentry(filp)->d_name.name; + ctx->f_mode = filp->f_mode; + ctx->key = (unsigned long)filp; + ctx->i_size = file_inode(filp)->i_size; + ctx->prev_index = filp->f_ra.prev_pos >> PAGE_SHIFT; + ctx->index = pos >> PAGE_SHIFT; +} + +#define FS_FILE_READ_VERSION 1 +#define FS_FILE_READ_MODE_MASK (FMODE_RANDOM | FMODE_WILLNEED | FMODE_SPC_READAHEAD) + +void fs_file_read_update_args_by_trace(struct kiocb *iocb) +{ + struct file *filp = iocb->ki_filp; + struct fs_file_read_ctx ctx; + + fs_file_read_ctx_init(&ctx, filp, iocb->ki_pos); + trace_fs_file_read(&ctx, FS_FILE_READ_VERSION); + + if (!ctx.set_f_mode && !ctx.clr_f_mode) + return; + + filp->f_mode |= ctx.set_f_mode & FS_FILE_READ_MODE_MASK; + filp->f_mode &= ~(ctx.clr_f_mode & FS_FILE_READ_MODE_MASK); +} +EXPORT_SYMBOL_GPL(fs_file_read_update_args_by_trace); +#endif + EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_read); EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_release); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7ee16d2fe83b0..bcd2131ca06cc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -38,6 +38,7 @@ #include <linux/uuid.h> #include <linux/errseq.h> #include <linux/ioprio.h> +#include <linux/tracepoint-defs.h>
#include <asm/byteorder.h> #include <uapi/linux/fs.h> @@ -3602,4 +3603,16 @@ struct fs_file_read_ctx { long long index; };
+#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(fs_file_read); +extern void fs_file_read_update_args_by_trace(struct kiocb *iocb); +#else +static inline void fs_file_read_update_args_by_trace(struct kiocb *iocb) {} +#endif + +static inline void fs_file_read_do_trace(struct kiocb *iocb) +{ + if (tracepoint_enabled(fs_file_read)) + fs_file_read_update_args_by_trace(iocb); +} #endif /* _LINUX_FS_H */
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
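Call fs_file_read_do_trace() in xfs_file_buffered_aio_read() and trace_fs_file_release() in xfs_file_release(), so that xfs reads and releases are reported through the fs_file_read and fs_file_release tracepoints.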
Signed-off-by: Hou Tao houtao1@huawei.com Acked-by: fang wei fangwei1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/xfs/xfs_file.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1b2eb9d055ba0..12d8fed01c956 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -34,6 +34,7 @@ #include <linux/backing-dev.h> #include <linux/mman.h> #include <linux/fadvise.h> +#include <trace/events/fs.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -236,6 +237,7 @@ xfs_file_buffered_aio_read( ssize_t ret;
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + fs_file_read_do_trace(iocb);
if (iocb->ki_flags & IOCB_NOWAIT) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) @@ -1018,6 +1020,7 @@ xfs_file_release( struct inode *inode, struct file *filp) { + trace_fs_file_release(inode, filp); return xfs_release(XFS_I(inode)); }
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
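Call fs_file_read_do_trace() in ext4_file_read_iter() (after the DAX early return) and trace_fs_file_release() in ext4_release_file(), so that ext4 reads and releases are reported through the fs_file_read and fs_file_release tracepoints.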
Signed-off-by: Hou Tao houtao1@huawei.com Acked-by: fang wei fangwei1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/file.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 21b0fc0d6ffed..5c3304ddc5ec9 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -29,6 +29,7 @@ #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> +#include <trace/events/fs.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -75,6 +76,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (IS_DAX(file_inode(iocb->ki_filp))) return ext4_dax_read_iter(iocb, to); #endif + fs_file_read_do_trace(iocb); return generic_file_read_iter(iocb, to); }
@@ -85,6 +87,8 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) */ static int ext4_release_file(struct inode *inode, struct file *filp) { + trace_fs_file_release(inode, filp); + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
Identify a writable tracepoint program by the section prefix "raw_tracepoint.w/".
The proper way would be to backport commit ccaf12d6215a ("libbpf: Support detecting and attaching of writable tracepoint program"), but the refactoring of libbpf makes that hard, so use the same section prefix as ccaf12d6215a and post a home-made patch instead.
Signed-off-by: Hou Tao houtao1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d49b86d492cdb..ccfd17bd0f1be 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2119,6 +2119,7 @@ static const struct { BPF_PROG_SEC("action", BPF_PROG_TYPE_SCHED_ACT), BPF_PROG_SEC("tracepoint/", BPF_PROG_TYPE_TRACEPOINT), BPF_PROG_SEC("raw_tracepoint/", BPF_PROG_TYPE_RAW_TRACEPOINT), + BPF_PROG_SEC("raw_tracepoint.w/", BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("cgroup/skb", BPF_PROG_TYPE_CGROUP_SKB),
From: Hou Tao houtao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H3JT CVE: NA
---------------------------
The file_read_pattern test attaches eBPF programs to fs_file_read() and fs_file_release() respectively. The program for fs_file_read() records the read history, calculates the read pattern and sets f_mode for specific files, and the program for fs_file_release() cleans up the saved read history.
Signed-off-by: Hou Tao houtao1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/testing/selftests/bpf/Makefile | 2 + .../testing/selftests/bpf/file_read_pattern.c | 73 +++++++++ .../selftests/bpf/file_read_pattern_prog.c | 142 ++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 tools/testing/selftests/bpf/file_read_pattern.c create mode 100644 tools/testing/selftests/bpf/file_read_pattern_prog.c
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f3f874ba186bb..188fccde8c089 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -24,6 +24,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ test_socket_cookie test_cgroup_storage test_select_reuseport +TEST_GEN_PROGS += file_read_pattern
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ @@ -36,6 +37,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ test_skb_cgroup_id_kern.o +TEST_GEN_FILES += file_read_pattern_prog.o
# Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/file_read_pattern.c b/tools/testing/selftests/bpf/file_read_pattern.c new file mode 100644 index 0000000000000..81e3a49f04246 --- /dev/null +++ b/tools/testing/selftests/bpf/file_read_pattern.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021. Huawei Technologies Co., Ltd */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <linux/bpf.h> +#include <linux/err.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "bpf_rlimit.h" + +#define READ_TP_NAME "fs_file_read" +#define RELEASE_TP_NAME "fs_file_release" + +int main(int argc, char *argv[]) +{ + const char *name = "./file_read_pattern_prog.o"; + struct bpf_object *obj; + const char *prog_name; + struct bpf_program *prog; + int unused; + int err; + int read_fd; + int release_fd; + + err = bpf_prog_load(name, BPF_PROG_TYPE_UNSPEC, &obj, &unused); + if (err) { + printf("Failed to load program\n"); + return err; + } + + prog_name = "raw_tracepoint.w/" READ_TP_NAME; + prog = bpf_object__find_program_by_title(obj, prog_name); + if (!prog) { + printf("no prog %s\n", prog_name); + err = -EINVAL; + goto out; + } + + read_fd = bpf_raw_tracepoint_open(READ_TP_NAME, bpf_program__fd(prog)); + if (read_fd < 0) { + err = -errno; + printf("Failed to attach raw tracepoint %s\n", READ_TP_NAME); + goto out; + } + + prog_name = "raw_tracepoint/" RELEASE_TP_NAME; + prog = bpf_object__find_program_by_title(obj, prog_name); + if (!prog) { + printf("no prog %s\n", prog_name); + err = -EINVAL; + goto out; + } + + release_fd = bpf_raw_tracepoint_open(RELEASE_TP_NAME, + bpf_program__fd(prog)); + if (release_fd < 0) { + err = -errno; + printf("Failed to attach raw tracepoint %s\n", RELEASE_TP_NAME); + goto out; + } + + pause(); + + close(release_fd); + close(read_fd); +out: + bpf_object__close(obj); + return err; +} diff --git 
a/tools/testing/selftests/bpf/file_read_pattern_prog.c b/tools/testing/selftests/bpf/file_read_pattern_prog.c new file mode 100644 index 0000000000000..4539eefbf942b --- /dev/null +++ b/tools/testing/selftests/bpf/file_read_pattern_prog.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021. Huawei Technologies Co., Ltd */ +#include <stdbool.h> +#include <string.h> +#include <linux/bpf.h> + +#include "bpf_helpers.h" + +#ifndef __always_inline +#define __always_inline inline __attribute__((always_inline)) +#endif + +/* Need to keep consistent with definitions in include/linux/fs.h */ +#define FMODE_RANDOM 0x1000 +#define FMODE_WILLNEED 0x400000 + +struct fs_file_read_ctx { + const unsigned char *name; + unsigned int f_mode; + unsigned int rsvd; + /* clear from f_mode */ + unsigned int clr_f_mode; + /* set into f_mode */ + unsigned int set_f_mode; + unsigned long key; + /* file size */ + long long i_size; + /* previous page index */ + long long prev_index; + /* current page index */ + long long index; +}; + +struct fs_file_read_args { + struct fs_file_read_ctx *ctx; + int version; +}; + +struct fs_file_release_args { + void *inode; + void *filp; +}; + +struct file_rd_hist { + __u64 last_nsec; + __u32 seq_nr; + __u32 tot_nr; +}; + +struct bpf_map_def SEC("maps") htab = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(long), + .value_size = sizeof(struct file_rd_hist), + .max_entries = 10000, +}; + +static __always_inline bool is_expected_file(void *name) +{ + char prefix[5]; + int err; + + err = bpf_probe_read_str(&prefix, sizeof(prefix), name); + if (err <= 0) + return false; + return !strncmp(prefix, "blk_", 4); +} + +SEC("raw_tracepoint.w/fs_file_read") +int fs_file_read(struct fs_file_read_args *args) +{ + const char fmt[] = "elapsed %llu, seq %u, tot %u\n"; + struct fs_file_read_ctx *rd_ctx = args->ctx; + struct file_rd_hist *hist; + struct file_rd_hist new_hist; + __u64 key; + __u64 now; + bool first; + + if (!is_expected_file((void 
*)rd_ctx->name)) + return 0; + + if (rd_ctx->i_size <= (4 << 20)) { + rd_ctx->set_f_mode = FMODE_WILLNEED; + return 0; + } + + first = false; + now = bpf_ktime_get_ns(); + key = rd_ctx->key; + hist = bpf_map_lookup_elem(&htab, &key); + if (!hist) { + __builtin_memset(&new_hist, 0, sizeof(new_hist)); + new_hist.last_nsec = now; + first = true; + hist = &new_hist; + } + + if (rd_ctx->index >= rd_ctx->prev_index && + rd_ctx->index - rd_ctx->prev_index <= 1) + hist->seq_nr += 1; + hist->tot_nr += 1; + + bpf_trace_printk(fmt, sizeof(fmt), now - hist->last_nsec, + hist->seq_nr, hist->tot_nr); + + if (first) { + bpf_map_update_elem(&htab, &key, hist, 0); + return 0; + } + + /* 500ms or 10 read */ + if (now - hist->last_nsec >= 500000000ULL || hist->tot_nr >= 10) { + if (hist->tot_nr >= 10) { + if (hist->seq_nr <= hist->tot_nr * 3 / 10) + rd_ctx->set_f_mode = FMODE_RANDOM; + else if (hist->seq_nr >= hist->tot_nr * 7 / 10) + rd_ctx->clr_f_mode = FMODE_RANDOM; + } + + hist->last_nsec = now; + hist->tot_nr = 0; + hist->seq_nr = 0; + } + + return 0; +} + +SEC("raw_tracepoint/fs_file_release") +int fs_file_release(struct fs_file_release_args *args) +{ + __u64 key = (unsigned long)args->filp; + void *value; + + value = bpf_map_lookup_elem(&htab, &key); + if (value) + bpf_map_delete_elem(&htab, &key); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1;