From: Matt Mullins mmullins@fb.com
mainline inclusion from mainline-v5.2-rc1 commit 9df1c28bb75217b244257152ab7d788bb2a386d0 category: feature bugzilla: NA CVE: NA
-------------------------------------------------------------------------
This is an opt-in interface that allows a tracepoint to provide a safe buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program. The size of the buffer must be a compile-time constant, and is checked before allowing a BPF program to attach to a tracepoint that uses this feature.
The pointer to this buffer will be the first argument of tracepoints that opt in; the pointer is valid and can be bpf_probe_read() by both BPF_PROG_TYPE_RAW_TRACEPOINT and BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE programs that attach to such a tracepoint, but the buffer to which it points may only be written by the latter.
Signed-off-by: Matt Mullins mmullins@fb.com Acked-by: Yonghong Song yhs@fb.com Signed-off-by: Alexei Starovoitov ast@kernel.org Conflict: include/linux/bpf.h include/uapi/linux/bpf.h kernel/bpf/verifier.c Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/bpf.h | 9 +++++++++ include/linux/bpf_types.h | 1 + include/linux/tracepoint-defs.h | 1 + include/trace/bpf_probe.h | 27 +++++++++++++++++++++++++-- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/syscall.c | 8 ++++++-- kernel/bpf/verifier.c | 31 +++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 24 ++++++++++++++++++++++++ 8 files changed, 100 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 685bbc0139add..2eed5ecdcdc75 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -214,6 +214,14 @@ enum bpf_reg_type { PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ + PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ + PTR_TO_SOCKET, /* reg points to struct bpf_sock */ + PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ + PTR_TO_SOCK_COMMON, /* reg points to sock_common */ + PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ + PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ + PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ + PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ };
/* The information passed from prog-specific *_is_valid_access @@ -285,6 +293,7 @@ struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; + u32 max_tp_access; u32 stack_depth; u32 id; u32 func_cnt; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b3f9156a519ab..b5b9196a30134 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -22,6 +22,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 49ba9cde7e4bb..b29950a19205e 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -45,6 +45,7 @@ struct bpf_raw_event_map { struct tracepoint *tp; void *bpf_func; u32 num_args; + u32 writable_size; } __aligned(32);
#endif diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 505dae0bed80b..d6e556c0a0852 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -69,8 +69,7 @@ __bpf_trace_##call(void *__data, proto) \ * to make sure that if the tracepoint handling changes, the * bpf probe will fail to compile unless it too is updated. */ -#undef DEFINE_EVENT -#define DEFINE_EVENT(template, call, proto, args) \ +#define __DEFINE_EVENT(template, call, proto, args, size) \ static inline void bpf_test_probe_##call(void) \ { \ check_trace_callback_type_##call(__bpf_trace_##template); \ @@ -81,12 +80,36 @@ __bpf_trace_tp_map_##call = { \ .tp = &__tracepoint_##call, \ .bpf_func = (void *)__bpf_trace_##template, \ .num_args = COUNT_ARGS(args), \ + .writable_size = size, \ };
+#define FIRST(x, ...) x + +#undef DEFINE_EVENT_WRITABLE +#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \ +static inline void bpf_test_buffer_##call(void) \ +{ \ + /* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \ + * BUILD_BUG_ON_ZERO() uses a different mechanism that is not \ + * dead-code-eliminated. \ + */ \ + FIRST(proto); \ + (void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args))); \ +} \ +__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) + +#undef DEFINE_EVENT +#define DEFINE_EVENT(template, call, proto, args) \ + __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0)
#undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef DEFINE_EVENT_WRITABLE +#undef __DEFINE_EVENT +#undef FIRST + #endif /* CONFIG_BPF_EVENTS */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2bbae9e2786ea..b6ff566503d11 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -154,6 +154,9 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, };
enum bpf_attach_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fc0628d133166..116c8b32c5716 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1578,12 +1578,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } raw_tp->btp = btp;
- prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, - BPF_PROG_TYPE_RAW_TRACEPOINT); + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out_free_tp; } + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + }
err = bpf_probe_register(raw_tp->btp, prog); if (err) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b748ec3dd6ba0..623c50e19ece9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -260,6 +260,7 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_TP_BUFFER] = "tp_buffer", };
static char slot_type_char[] = { @@ -1590,6 +1591,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env, return 0; }
+static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + if (off < 0) { + verbose(env, + "R%d invalid tracepoint buffer access: off=%d, size=%d", + regno, off, size); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d invalid variable buffer offset: off=%d, var_off=%s", + regno, off, tn_buf); + return -EACCES; + } + if (off + size > env->prog->aux->max_tp_access) + env->prog->aux->max_tp_access = off + size; + + return 0; +} + + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -1709,6 +1736,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_TP_BUFFER) { + err = check_tp_buffer_access(env, reg, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index bb5e8b1922a62..b8726b8a46022 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -978,6 +978,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { };
+static bool raw_tp_writable_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off == 0) { + if (size != sizeof(u64) || type != BPF_READ) + return false; + info->reg_type = PTR_TO_TP_BUFFER; + } + return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -1267,6 +1288,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL;
+ if (prog->aux->max_tp_access > btp->writable_size) + return -EINVAL; + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); }
From: Matt Mullins mmullins@fb.com
mainline inclusion from mainline-v5.2-rc1 commit 4635b0ae4d26f87cf68dbab6740955dd1ad67cf4 category: feature bugzilla: NA CVE: NA
-------------------------------------------------------------------------
This adds BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, and fixes up the
error: enumeration value ‘BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE’ not handled in switch [-Werror=switch-enum]
build errors it would otherwise cause in libbpf.
Signed-off-by: Matt Mullins mmullins@fb.com Acked-by: Yonghong Song yhs@fb.com Signed-off-by: Alexei Starovoitov ast@kernel.org Conflict: tools/include/uapi/linux/bpf.h tools/lib/bpf/libbpf.c Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/include/uapi/linux/bpf.h | 12 +++++++++++- tools/lib/bpf/libbpf.c | 1 + 2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f833c0385dd69..4f139c1511ac1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -153,6 +153,9 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, };
enum bpf_attach_type { @@ -1658,12 +1661,19 @@ union bpf_attr { * error if an eBPF program tries to set a callback that is not * supported in the current kernel. * - * The supported callback values that *argval* can combine are: + * *argval* is a flag array which can combine these flags: * * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * * Here are some examples of where one could call such eBPF * program: * diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0ca56681f9cf5..64056dcecf71d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1516,6 +1516,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: default: return true; }
From: "Daniel T. Lee" danieltimlee@gmail.com
mainline inclusion from mainline-v5.3-rc1 commit 216b65fb706e34128a5317d71b300daac9c428c4 category: feature bugzilla: NA CVE: NA
-------------------------------------------------------------------------
From commit 9df1c28bb752 ("bpf: add writable context for raw tracepoints"), a new type of BPF_PROG, RAW_TRACEPOINT_WRITABLE has been added.
Since this BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE is not listed at bpftool's header, it causes a segfault when executing 'bpftool feature'.
This commit adds BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE entry to prog_type_name enum, and will eventually fixes the segfault issue.
Fixes: 9df1c28bb752 ("bpf: add writable context for raw tracepoints") Signed-off-by: Daniel T. Lee danieltimlee@gmail.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Conflict: tools/bpf/bpftool/main.h Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/bpf/bpftool/prog.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 2b5476312cba6..05d69a3bfe335 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -72,6 +72,7 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_CGROUP_DEVICE] = "cgroup_device", [BPF_PROG_TYPE_SK_MSG] = "sk_msg", [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr", [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", };
From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
commit 9df1c28bb752 ("bpf: add writable context for raw tracepoints") add new member into struct bpf_prog_aux and struct bpf_raw_event_map, which will break KABI. This patch try to fix it.
Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/bpf.h | 5 +++++ include/linux/tracepoint-defs.h | 2 ++ include/uapi/linux/bpf.h | 2 ++ 3 files changed, 9 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2eed5ecdcdc75..4d5243bb385bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -214,6 +214,7 @@ enum bpf_reg_type { PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ +#ifndef __GENKSYMS__ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ @@ -222,6 +223,7 @@ enum bpf_reg_type { PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ +#endif };
/* The information passed from prog-specific *_is_valid_access @@ -293,7 +295,10 @@ struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; + /* not protected by KABI, safe to extend in the middle */ +#ifndef __GENKSYMS__ u32 max_tp_access; +#endif u32 stack_depth; u32 id; u32 func_cnt; diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index b29950a19205e..fb17ab8934575 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -45,7 +45,9 @@ struct bpf_raw_event_map { struct tracepoint *tp; void *bpf_func; u32 num_args; +#ifndef __GENKSYMS__ u32 writable_size; +#endif } __aligned(32);
#endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b6ff566503d11..8b66fe767183b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -154,9 +154,11 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, +#ifndef __GENKSYMS__ BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, +#endif };
enum bpf_attach_type {
From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
This patch tries to fix follow gcc error:
libbpf.c: In function ‘bpf_prog_type__needs_kver’: libbpf.c:1495:2: error: enumeration value ‘BPF_PROG_TYPE_FLOW_DISSECTOR’ not handled in switch [-Werror=switch-enum] 1495 | switch (type) { | ^~~~~~ libbpf.c:1495:2: error: enumeration value ‘BPF_PROG_TYPE_CGROUP_SYSCTL’ not handled in switch [-Werror=switch-enum] cc1: all warnings being treated as errors mv: cannot stat './.libbpf.o.tmp': No such file or directory
Fixes: 4635b0ae4d26 ("tools: sync bpf.h") Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/lib/bpf/libbpf.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 64056dcecf71d..4c87f82dc2a6a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1510,6 +1510,8 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_LIRC_MODE2: case BPF_PROG_TYPE_SK_REUSEPORT: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_SYSCTL: return false; case BPF_PROG_TYPE_UNSPEC: case BPF_PROG_TYPE_KPROBE: