v1->v2: Move local_skb to a reserved field instead of inserting it to a structure hole. Fix a cgroup ref leak bug.
Hengqi Chen (1): libbpf: Support uniform BTF-defined key/value specification across all BPF maps
Liu Jian (4): cgroup: make cgroup_bpf_prog_attach work when cgroup2 is not mounted net: let sockops can use bpf_get_current_comm() net: add local_skb parameter to identify local tcp connection tools: add sample sockmap code for redis
Xu Kuohai (1): bpf, sockmap: Fix map type error in sock_map_del_link
Yosry Ahmed (1): cgroup: add cgroup_v1v2_get_from_[fd/file]()
include/linux/cgroup.h | 2 + include/linux/filter.h | 1 + include/linux/skbuff.h | 2 +- include/uapi/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 8 +- kernel/cgroup/cgroup.c | 72 +++++++- net/core/filter.c | 9 + net/core/sock_map.c | 10 +- net/ipv4/tcp_input.c | 4 +- net/ipv4/tcp_output.c | 4 + tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/libbpf.c | 24 +++ tools/netacc/Makefile | 24 +++ tools/netacc/bpf_sockmap.h | 167 +++++++++++++++++++ tools/netacc/net-acc | 35 ++++ tools/netacc/netacc.c | 296 +++++++++++++++++++++++++++++++++ tools/netacc/netaccsockmap.c | 176 ++++++++++++++++++++ 17 files changed, 820 insertions(+), 16 deletions(-) create mode 100644 tools/netacc/Makefile create mode 100644 tools/netacc/bpf_sockmap.h create mode 100755 tools/netacc/net-acc create mode 100644 tools/netacc/netacc.c create mode 100644 tools/netacc/netaccsockmap.c
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/2699 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/5...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/2699 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/5...
From: Hengqi Chen hengqi.chen@gmail.com
mainline inclusion from mainline-v5.16-rc1 commit f731052325efc3726577feb743c7495f880ae07d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7DNAP CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
A bunch of BPF maps do not support specifying BTF types for key and value. This is non-uniform and inconvenient[0]. Currently, libbpf uses retry logic that removes the BTF type IDs when BPF map creation fails. Instead of retrying, this commit recognizes those specialized maps and removes the BTF type IDs before creating the BPF map.
[0] Closes: https://github.com/libbpf/libbpf/issues/355
Signed-off-by: Hengqi Chen hengqi.chen@gmail.com Signed-off-by: Andrii Nakryiko andrii@kernel.org Link: https://lore.kernel.org/bpf/20210930161456.3444544-2-hengqi.chen@gmail.com (cherry picked from commit f731052325efc3726577feb743c7495f880ae07d) Signed-off-by: Liu Jian liujian56@huawei.com
Conflicts: tools/lib/bpf/libbpf.c --- tools/lib/bpf/libbpf.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9dd29b39010c..283881242222 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4229,6 +4229,30 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) create_attr.inner_map_fd = map->inner_map_fd; }
+ switch (def->type) { /* map types for which the BTF key/value type IDs are cleared before creation */ + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_STACK_TRACE: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_RINGBUF: + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + map->btf_key_type_id = 0; + map->btf_value_type_id = 0; /* fallthrough */ + default: + break; + } + map->fd = bpf_create_map_xattr(&create_attr); if (map->fd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) {
From: Xu Kuohai xukuohai@huawei.com
mainline inclusion from mainline-v6.5-rc6 commit 7e96ec0e6605b69bb21bbf6c0ff9051e656ec2b1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7DNAP CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
sock_map_del_link() operates on both SOCKMAP and SOCKHASH. Although both types have a member named "progs", the offset of "progs" differs between the two types, so "progs" should be accessed through the real map type.
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Xu Kuohai xukuohai@huawei.com Reviewed-by: John Fastabend john.fastabend@gmail.com Link: https://lore.kernel.org/r/20230804073740.194770-2-xukuohai@huaweicloud.com Signed-off-by: Martin KaFai Lau martin.lau@kernel.org Signed-off-by: Liu Jian liujian56@huawei.com
Conflicts: net/core/sock_map.c --- net/core/sock_map.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 66b7f3fb01ed..93f2b7893095 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,6 +24,8 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); + static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; @@ -157,11 +159,11 @@ static void sock_map_del_link(struct sock *sk, list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; - struct bpf_stab *stab = container_of(map, struct bpf_stab, - map); - if (psock->parser.enabled && stab->progs.skb_parser) + struct sk_psock_progs *progs = sock_map_progs(map); + + if (psock->parser.enabled && progs->skb_parser) strp_stop = true; - if (psock->parser.enabled && stab->progs.skb_verdict) + if (psock->parser.enabled && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link);
From: Yosry Ahmed yosryahmed@google.com
mainline inclusion from mainline-v6.1-rc2 commit a6d1ce5951185ee91bbe6909fe2758f3625561b0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7DNAP CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Add cgroup_v1v2_get_from_fd() and cgroup_v1v2_get_from_file() that support both cgroup1 and cgroup2.
Signed-off-by: Yosry Ahmed yosryahmed@google.com Signed-off-by: Tejun Heo tj@kernel.org Signed-off-by: Liu Jian liujian56@huawei.com --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 50 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 6 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a60e9966b32d..0859f1bfe5b0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -106,6 +106,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); +struct cgroup *cgroup_v1v2_get_from_fd(int fd);
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 24657a31d264..4d048e10be15 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6190,16 +6190,36 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); }
-static struct cgroup *cgroup_get_from_file(struct file *f) +/** + * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer + * @f: file corresponding to cgroup_dir + * + * Find the cgroup from a file pointer associated with a cgroup directory. + * Returns a pointer to the cgroup on success. ERR_PTR is returned if the + * cgroup cannot be found. + */ +static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; - struct cgroup *cgrp;
css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css);
- cgrp = css->cgroup; + return css->cgroup; +} + +/** + * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports + * cgroup2. + */ +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); @@ -6667,14 +6687,14 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
/** * cgroup_get_from_fd - get a cgroup pointer from a fd - * @fd: fd obtained by open(cgroup2_dir) + * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ -struct cgroup *cgroup_get_from_fd(int fd) +struct cgroup *cgroup_v1v2_get_from_fd(int fd) { struct cgroup *cgrp; struct file *f; @@ -6683,10 +6703,28 @@ struct cgroup *cgroup_get_from_fd(int fd) if (!f) return ERR_PTR(-EBADF);
- cgrp = cgroup_get_from_file(f); + cgrp = cgroup_v1v2_get_from_file(f); fput(f); return cgrp; } + +/** + * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports + * cgroup2. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + return cgrp; +} EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
static u64 power_of_ten(int power)
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7DNAP CVE: N/A
----------------------------------------------------
BPF_PROG_TYPE_CGROUP* bpf programs are associated with cgroup2. If cgroup2 is not mounted, the bpf program is associated with cgrp_dfl_root.cgrp by default.
Then we can use it like below: bpftool cgroup attach /sys/fs/cgroup/cpu sock_ops pinned /sys/fs/bpf/xxx
Signed-off-by: Liu Jian liujian56@huawei.com --- v1->v2: Fix a cgroup ref leak bug. include/linux/cgroup.h | 1 + kernel/bpf/cgroup.c | 8 ++++---- kernel/cgroup/cgroup.c | 22 ++++++++++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0859f1bfe5b0..e706ff15ec88 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -106,6 +106,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); +struct cgroup *cgroup_get_from_fd_v2(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd);
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e84002f11866..8750004f80a0 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -848,7 +848,7 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, struct cgroup *cgrp; int ret;
- cgrp = cgroup_get_from_fd(attr->target_fd); + cgrp = cgroup_get_from_fd_v2(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp);
@@ -876,7 +876,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) struct cgroup *cgrp; int ret;
- cgrp = cgroup_get_from_fd(attr->target_fd); + cgrp = cgroup_get_from_fd_v2(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp);
@@ -993,7 +993,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (attr->link_create.flags) return -EINVAL;
- cgrp = cgroup_get_from_fd(attr->link_create.target_fd); + cgrp = cgroup_get_from_fd_v2(attr->link_create.target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp);
@@ -1033,7 +1033,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, struct cgroup *cgrp; int ret;
- cgrp = cgroup_get_from_fd(attr->query.target_fd); + cgrp = cgroup_get_from_fd_v2(attr->query.target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4d048e10be15..3d778636f2e8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6727,6 +6727,28 @@ struct cgroup *cgroup_get_from_fd(int fd) } EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
+/** + * cgroup_get_from_fd_v2 - cgroup_get_from_fd() with a cgrp_dfl_root fallback + */ +struct cgroup *cgroup_get_from_fd_v2(int fd) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); /* drop the ref on the non-default-hierarchy cgroup */ + if (cgrp_dfl_visible) + return ERR_PTR(-EBADF); /* same result as cgroup_get_from_fd() */ + + cgrp = &cgrp_dfl_root.cgrp; /* substitute the default root cgroup */ + cgroup_get(cgrp); /* take a reference on the cgroup being returned */ + } + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_fd_v2); + static u64 power_of_ten(int power) { u64 v = 1;
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7DNAP CVE: N/A
----------------------------------------------------
Allow sockops programs to use bpf_get_current_comm().
Signed-off-by: Liu Jian liujian56@huawei.com --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/net/core/filter.c b/net/core/filter.c index c1623a2ac079..6268ef7c8735 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7464,6 +7464,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; default: return bpf_sk_base_func_proto(func_id); }
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7DNAP CVE: N/A
----------------------------------------------------
Add a local_skb field to struct sk_buff to identify local connections. Currently, this feature is used only for BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB and BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, to help the sockops bpf program check whether the current connection is a local connection. Updating the local_skb variable only when the ACK packet is sent is sufficient for this feature to work.
Signed-off-by: Liu Jian liujian56@huawei.com --- v1->v2: Move local_skb to a reserved field instead of inserting it to a structure hole.
include/linux/filter.h | 1 + include/linux/skbuff.h | 2 +- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 7 +++++++ net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_output.c | 4 ++++ tools/include/uapi/linux/bpf.h | 1 + 7 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h index a2c9ca9626e9..4479a49a4f7c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1284,6 +1284,7 @@ struct bpf_sock_ops_kern { u8 op; u8 is_fullsock; u8 remaining_opt_len; + u8 local_skb; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d16c8bd085f3..a104fdd74aba 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -922,7 +922,7 @@ struct sk_buff { __u32 headers_end[0]; /* public: */
- KABI_USE(1, __u8 scm_io_uring:1) + KABI_USE2(1, __u8 scm_io_uring:1, __u8 local_skb:1) KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79d5e5850bf6..2f9e57e99bda 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4707,6 +4707,7 @@ struct bpf_sock_ops { * the outgoing header has not * been written yet. */ + __u32 local_skb; };
/* Definitions for bpf_sock_ops_cb_flags */ diff --git a/net/core/filter.c b/net/core/filter.c index 6268ef7c8735..18d17598fcee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9670,6 +9670,13 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, tcp_flags), si->dst_reg, si->dst_reg, off); break; + case offsetof(struct bpf_sock_ops, local_skb): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + local_skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + local_skb)); + break; } return insn - insn_buf; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 58a8f211b997..c18a0300abde 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -185,8 +185,10 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, sock_ops.is_fullsock = 1; sock_ops.sk = sk; /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ - if (skb) + if (skb) { bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + sock_ops.local_skb = skb->local_skb; + }
BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 680d7f06a51c..c3d87299f0ce 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3671,6 +3671,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif
+ skb->local_skb = 1; + bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts);
@@ -3910,6 +3912,7 @@ int tcp_connect(struct sock *sk) if (unlikely(!buff)) return -ENOBUFS;
+ buff->local_skb = 1; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp(tp); @@ -4028,6 +4031,7 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
/* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); + buff->local_skb = 1; tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
/* We do not want pure acks influencing TCP Small Queues or fq/pacing diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 807232f0c7e0..96554988d8a8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4707,6 +4707,7 @@ struct bpf_sock_ops { * the outgoing header has not * been written yet. */ + __u32 local_skb; };
/* Definitions for bpf_sock_ops_cb_flags */
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7DNAP CVE: N/A
----------------------------------------------------
add sample sockmap code for redis
Signed-off-by: Liu Jian liujian56@huawei.com --- tools/netacc/Makefile | 24 +++ tools/netacc/bpf_sockmap.h | 167 ++++++++++++++++++++ tools/netacc/net-acc | 35 +++++ tools/netacc/netacc.c | 296 +++++++++++++++++++++++++++++++++++ tools/netacc/netaccsockmap.c | 176 +++++++++++++++++++++ 5 files changed, 698 insertions(+) create mode 100644 tools/netacc/Makefile create mode 100644 tools/netacc/bpf_sockmap.h create mode 100755 tools/netacc/net-acc create mode 100644 tools/netacc/netacc.c create mode 100644 tools/netacc/netaccsockmap.c
diff --git a/tools/netacc/Makefile b/tools/netacc/Makefile new file mode 100644 index 000000000000..20aa35d97551 --- /dev/null +++ b/tools/netacc/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 + +INSTALL ?= install +CLANG ?= clang +CC ?= gcc +BPFTOOL ?= bpftool +TOPDIR ?= ../.. +MKFLAGS = -I$(TOPDIR)/tools/lib -I$(TOPDIR)/tools/include/uapi/ +LDLIBBPF = -L$(TOPDIR)/tools/lib/bpf/ -l:libbpf.a + +all: + $(CLANG) -O2 -g -Wall -target bpf $(MKFLAGS) -c netaccsockmap.c -o netaccsockmap.o + $(BPFTOOL) gen skeleton netaccsockmap.o > netaccsockmap.skel.h + $(CC) -O2 -g -Wall $(MKFLAGS) netacc.c -o netacc $(LDLIBBPF) -lelf -lz + +clean: + rm -f netacc + rm -f netaccsockmap.skel.h + rm -f *.o + +install: + mkdir -p $(INSTALL_ROOT)/usr/sbin/tuned_acc/ + $(INSTALL) -m 755 net-acc $(INSTALL_ROOT)/usr/sbin/ + $(INSTALL) -m 755 netacc $(INSTALL_ROOT)/usr/sbin/tuned_acc/ diff --git a/tools/netacc/bpf_sockmap.h b/tools/netacc/bpf_sockmap.h new file mode 100644 index 000000000000..44dd3fee9ac3 --- /dev/null +++ b/tools/netacc/bpf_sockmap.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Huawei Technologies Co., Ltd + */ + +#ifndef __BPF_SOCKMAP_H__ +#define __BPF_SOCKMAP_H__ + +#include <stddef.h> +#include <stdbool.h> +#include <linux/types.h> +#include <linux/bpf.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define LOG_DEBUG 0 +#define SOCKMAP_SIZE 100000 + +#if LOG_DEBUG +#define net_dbg bpf_printk +#define net_err bpf_printk +#else +#define net_dbg(fmt, ...) 
do {} while (0) +#define net_err bpf_printk +#endif + +/* Unless otherwise specified, change ipaddr to network byte order */ +struct sock_key { + __u32 sip4; + __u32 dip4; + __u32 sport; + __u32 dport; + __u64 netns_cookie; +} __attribute__((packed)); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __type(key, struct sock_key); + __type(value, int); + __uint(max_entries, SOCKMAP_SIZE); + __uint(map_flags, 0); +} netaccsock_map SEC(".maps"); + +struct sock_info { + __u64 redir_rx_cnt; + __u64 redir_tx_cnt; + int sk_flags; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct sock_key); + __type(value, struct sock_info); + __uint(max_entries, SOCKMAP_SIZE); + __uint(map_flags, 0); +} sockflag_map SEC(".maps"); + +/* in network byte order */ +#define IS_LOOPBACK(a) ((((__u32) (a)) & 0x000000ff) == 0x0000007f) +#define IS_NOT_LOOPBACK(a) ((((__u32) (a)) & 0x000000ff) != 0x0000007f) + +static inline void sock_key_add_netnsinfo(void *const ctx, struct sock_key *key) +{ + if (IS_NOT_LOOPBACK(key->sip4) || IS_NOT_LOOPBACK(key->dip4)) + key->netns_cookie = 0; + else + key->netns_cookie = bpf_get_netns_cookie(ctx); +} + +static inline void sock_key2peerkey(struct sock_key *key, struct sock_key *peer_key) +{ + peer_key->sip4 = key->dip4; + peer_key->sport = key->dport; + peer_key->dip4 = key->sip4; + peer_key->dport = key->sport; +} + +static inline void extract_key4_from_ops(struct bpf_sock_ops *ops, struct sock_key *key) +{ + key->dip4 = ops->remote_ip4; + key->sip4 = ops->local_ip4; + + // local_port is in host byte order + // and remote_port is in network byte order + key->sport = ops->local_port; + key->dport = bpf_ntohl(ops->remote_port); +} + +static inline void bpf_sock_ops_ipv4(struct bpf_sock_ops *skops) +{ + struct sock_key key = {}; + + extract_key4_from_ops(skops, &key); + sock_key_add_netnsinfo(skops, &key); + + bpf_sock_hash_update(skops, &netaccsock_map, &key, BPF_NOEXIST); +} + +static inline void bpf_sockmap_ipv4_insert(struct 
bpf_sock_ops *skops) +{ + if (bpf_ntohl(skops->remote_port) == 22 || skops->local_port == 22) + return; + + bpf_sock_ops_ipv4(skops); +} + +static inline void bpf_sockmap_ipv4_cleanup(struct bpf_sock_ops *skops, __u64 *cnt) +{ + struct sock_info *p_skinfo = NULL; + struct sock_key key = {}; + + extract_key4_from_ops(skops, &key); + sock_key_add_netnsinfo(skops, &key); + p_skinfo = bpf_map_lookup_elem(&sockflag_map, &key); + if (p_skinfo) { + if (cnt) + *cnt = p_skinfo->redir_tx_cnt; + bpf_map_delete_elem(&sockflag_map, &key); + } +} + +static inline void extract_key4_from_msg(struct sk_msg_md *msg, struct sock_key *key) +{ + key->sip4 = msg->local_ip4; + key->dip4 = msg->remote_ip4; + + // local_port is in host byte order + // and remote_port is in network byte order + key->sport = msg->local_port; + key->dport = bpf_ntohl(msg->remote_port); +} + +SEC("sk_msg") int netacc_redir(struct sk_msg_md *msg) +{ + struct sock_info *p_skinfo = NULL; + struct sock_info skinfo = {0}; + struct sock_key peer_key = {}; + struct sock_key key = {}; + int ret, addinfo = 0; + + extract_key4_from_msg(msg, &key); + sock_key_add_netnsinfo(msg, &key); + sock_key2peerkey(&key, &peer_key); + sock_key_add_netnsinfo(msg, &peer_key); + + p_skinfo = bpf_map_lookup_elem(&sockflag_map, &key); + if (p_skinfo != NULL && p_skinfo->sk_flags == 1) + return SK_PASS; + + if (p_skinfo == NULL) { + addinfo = 1; + p_skinfo = &skinfo; + } + + ret = bpf_msg_redirect_hash(msg, &netaccsock_map, &peer_key, BPF_F_INGRESS); + if (ret == SK_DROP) { + if (p_skinfo->sk_flags != 1) + p_skinfo->sk_flags = 1; + } + + p_skinfo->redir_tx_cnt++; + if (addinfo) + bpf_map_update_elem(&sockflag_map, &key, p_skinfo, BPF_ANY); + + return SK_PASS; +} +#endif diff --git a/tools/netacc/net-acc b/tools/netacc/net-acc new file mode 100755 index 000000000000..3b769e84168e --- /dev/null +++ b/tools/netacc/net-acc @@ -0,0 +1,35 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +function usage() { + echo "" + echo "Usage:" + echo 
" $0 [enable | disable]" + echo "" +} + +function get_cgrp_path() { + local CGRP=`mount | grep cgroup2 | head -n 1` + if [[ "$CGRP"X == "X" ]]; then + CGRP=`mount | grep cgroup | grep "net_cls,net_prio" | head -n 1` + fi + cgrp_path=`echo $CGRP | awk '{print $3}'` +} + +CMD=$1 + +get_cgrp_path +if [[ "$cgrp_path"X == "X" ]]; then + echo "Failed to obtain a valid cgroup mount point." + usage; + exit 1 +fi + +if [[ "$CMD"X == "enableX" ]]; then + /usr/sbin/tuned_acc/netacc enable ${cgrp_path} +elif [[ "$CMD"X == "disableX" ]]; then + /usr/sbin/tuned_acc/netacc disable ${cgrp_path} + exit 0 +else + usage; +fi diff --git a/tools/netacc/netacc.c b/tools/netacc/netacc.c new file mode 100644 index 000000000000..7c22490e2de4 --- /dev/null +++ b/tools/netacc/netacc.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Huawei Technologies Co., Ltd + */ + +#include <argp.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/resource.h> +#include <time.h> +#include <unistd.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include "netaccsockmap.skel.h" + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) +#endif + +#define CG_PATH "/sys/fs/cgroup/unified" +#define PIN_PATH "/sys/fs/bpf/netacc/" + +static int bump_memlock_rlimit(void) +{ + struct rlimit rlim_new = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + return setrlimit(RLIMIT_MEMLOCK, &rlim_new); +} + +struct net_acc_prog_info { + const char *prog_name; + const char *pin_path; + void **prog; + int *fd; +}; + +struct net_acc_map_info { + const char *map_name; + char *pin_path; + void **map; + int *fd; +}; + +struct { + int netacc_sockops_fd; + int netacc_redir_fd; + int netaccsock_map_fd; +} net_acc_fds; + +struct { + void *netacc_sockops_obj; + void *netacc_redir_obj; + void *netaccsock_map_obj; +} net_acc_obj; + +static struct net_acc_prog_info prog_infos[] = { + { + .prog_name = 
"netacc_sockops", + .pin_path = PIN_PATH"sockops", + .prog = &net_acc_obj.netacc_sockops_obj, + .fd = &net_acc_fds.netacc_sockops_fd, + }, + { + .prog_name = "netacc_redir", + .pin_path = PIN_PATH"sk_msg", + .prog = &net_acc_obj.netacc_redir_obj, + .fd = &net_acc_fds.netacc_redir_fd, + } +}; + +static struct net_acc_map_info map_infos[] = { + { + .map_name = "netaccsock_map", + .pin_path = PIN_PATH"netaccsock_map", + .map = &net_acc_obj.netaccsock_map_obj, + .fd = &net_acc_fds.netaccsock_map_fd, + } +}; + +int cg_fd = -1; +struct netaccsockmap *skel; + +int net_acc_enabled(void) +{ + int map_fd; + + map_fd = bpf_obj_get(map_infos[0].pin_path); + if (map_fd < 0) + return 0; + + close(map_fd); + return 1; +} + +int pin_prog_map(void) +{ + int i, mapj, progj; + int err = 0; + + mapj = ARRAY_SIZE(map_infos); + for (i = 0; i < mapj; i++) { + if (*map_infos[i].map) + err = bpf_map__pin(*map_infos[i].map, map_infos[i].pin_path); + if (err) { + mapj = i; + goto err1; + } + } + + progj = ARRAY_SIZE(prog_infos); + for (i = 0; i < progj; i++) { + if (*prog_infos[i].prog) + err = bpf_program__pin(*prog_infos[i].prog, prog_infos[i].pin_path); + if (err) { + progj = i; + goto err2; + } + } + return 0; +err2: + for (i = 0; i < progj; i++) { + if (*prog_infos[i].prog) + bpf_program__unpin(*prog_infos[i].prog, prog_infos[i].pin_path); + } +err1: + for (i = 0; i < mapj; i++) { + if (*map_infos[i].map) + bpf_map__unpin(*map_infos[i].map, map_infos[i].pin_path); + } + return 1; +} + +int attach_manually(void) +{ + int err; + + err = bpf_prog_attach(bpf_program__fd(skel->progs.netacc_sockops), cg_fd, + BPF_CGROUP_SOCK_OPS, 0); + if (err) { + fprintf(stderr, "failed to attach sockops programs, %d\n", err); + return -1; + } + + err = bpf_prog_attach(bpf_program__fd(skel->progs.netacc_redir), + bpf_map__fd(skel->maps.netaccsock_map), BPF_SK_MSG_VERDICT, 0); + if (err) { + fprintf(stderr, "failed to attach msg_verdict programs, %d\n", err); + goto cleanup1; + } + + 
net_acc_obj.netacc_sockops_obj = skel->progs.netacc_sockops; + net_acc_obj.netacc_redir_obj = skel->progs.netacc_redir; + net_acc_obj.netaccsock_map_obj = skel->maps.netaccsock_map; + return 0; +cleanup1: + bpf_prog_detach2(bpf_program__fd(skel->progs.netacc_sockops), cg_fd, BPF_CGROUP_SOCK_OPS); + return -1; +} + +void detach_manually(void) +{ + int err; + + err = bpf_prog_detach2(bpf_program__fd(skel->progs.netacc_redir), + bpf_map__fd(skel->maps.netaccsock_map), BPF_SK_MSG_VERDICT); + if (err) + fprintf(stderr, "failed to detach msg_verdict programs, %d\n", err); + + err = bpf_prog_detach2(bpf_program__fd(skel->progs.netacc_sockops), cg_fd, + BPF_CGROUP_SOCK_OPS); + if (err) + fprintf(stderr, "failed to detach sockops programs, %d\n", err); +} + +int net_acc_enable(void) +{ + int err; + + if (net_acc_enabled()) + return 0; + + err = bump_memlock_rlimit(); + if (err) { + fprintf(stderr, "failed to increase rlimit: %d", err); + close(cg_fd); + return 1; + } + + skel = netaccsockmap__open(); + if (!skel) { + fprintf(stderr, "failed to open and/or load BPF object\n"); + return 1; + } + + err = netaccsockmap__load(skel); + if (err) { + fprintf(stderr, "failed to load BPF object: %d\n", err); + goto cleanup; + } + + err = netaccsockmap__attach(skel); + if (err) { + fprintf(stderr, "failed to attach BPF programs\n"); + goto cleanup; + } + + err = attach_manually(); + if (err) { + fprintf(stderr, "failed to attach BPF programs\n"); + goto cleanup; + } + + err = pin_prog_map(); + if (err) { + fprintf(stderr, "failed to pin BPF programs and maps\n"); + goto cleanup1; + } + + return 0; + +cleanup1: + detach_manually(); +cleanup: + netaccsockmap__destroy(skel); + close(cg_fd); + + return err != 0; +} + + +int net_acc_disable(void) +{ + int i, err; + + if (!net_acc_enabled()) + return 0; + + for (i = 0; i < ARRAY_SIZE(map_infos); i++) { + if (map_infos[i].fd) { + *map_infos[i].fd = bpf_obj_get(map_infos[i].pin_path); + unlink(map_infos[i].pin_path); + } + } + + for (i = 0; i 
< ARRAY_SIZE(prog_infos); i++) { + if (prog_infos[i].fd) { + *prog_infos[i].fd = bpf_obj_get(prog_infos[i].pin_path); + unlink(prog_infos[i].pin_path); + } + } + + err = bpf_prog_detach2(net_acc_fds.netacc_redir_fd, + net_acc_fds.netaccsock_map_fd, BPF_SK_MSG_VERDICT); + if (err) + fprintf(stderr, "failed to detach msg_verdict programs, %d\n", err); + err = bpf_prog_detach2(net_acc_fds.netacc_sockops_fd, cg_fd, BPF_CGROUP_SOCK_OPS); + if (err) + fprintf(stderr, "failed to detach msg_verdict programs, %d\n", err); + + close(net_acc_fds.netacc_redir_fd); + close(net_acc_fds.netacc_sockops_fd); + rmdir(PIN_PATH); + return 0; +} + +int main(int argc, char **argv) +{ + char *cgrp_path = CG_PATH; + int ret = 1; + + if (argc != 2 && argc != 3) + return 1; + + if (argc == 3) + cgrp_path = argv[2]; + + cg_fd = open(cgrp_path, O_DIRECTORY, O_RDONLY); + if (cg_fd < 0) { + fprintf(stderr, "ERROR: (%d) open cgroup path failed: %s\n", cg_fd, cgrp_path); + return 1; + } + + if (strncmp(argv[1], "enable", 6) == 0) + ret = net_acc_enable(); + else if (strncmp(argv[1], "disable", 7) == 0) + ret = net_acc_disable(); + + close(cg_fd); + return ret; +} diff --git a/tools/netacc/netaccsockmap.c b/tools/netacc/netaccsockmap.c new file mode 100644 index 000000000000..99c6bd71244b --- /dev/null +++ b/tools/netacc/netaccsockmap.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Huawei Technologies Co., Ltd + */ + +#include "bpf_sockmap.h" + +#define NETACC_BIND_MAP_SIZE 100 + +#define CHECK_ACC_SOCK 1 + +struct ipaddr_port { + __u32 ip4; + __u32 port; +} __attribute__((packed)); + +#if CHECK_ACC_SOCK +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct ipaddr_port); + __type(value, int); + __uint(max_entries, NETACC_BIND_MAP_SIZE); + __uint(map_flags, 0); +} netacc_bind_map SEC(".maps"); + + +static inline int __is_netacc_sock(struct ipaddr_port *key) +{ + int *pv = NULL; + + pv = bpf_map_lookup_elem(&netacc_bind_map, key); + if (pv) + return 1; + + 
return 0; +} + +static inline int is_netacc_sock(struct ipaddr_port *key1, struct ipaddr_port *key10) +{ + net_dbg("is_netacc, ip1:0x%x, port1:0x%x\n", key1->ip4, key1->port); + + if (__is_netacc_sock(key1)) + return 1; + + if (__is_netacc_sock(key10)) + return 1; + + return 0; +} + +static inline void extract_dst_ipaddrport_from_ops(struct bpf_sock_ops *skops, + struct ipaddr_port *key) +{ + if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) { + key->ip4 = skops->remote_ip4; + // remote_port is in network byte order + key->port = bpf_ntohl(skops->remote_port); + } else if (skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) { + key->ip4 = skops->local_ip4; + // local_port is in host byte order + key->port = skops->local_port; + } +} + +static inline int is_netacc_interested_tcp(struct bpf_sock_ops *skops) +{ + struct ipaddr_port key = {0}; + struct ipaddr_port key0; + + // only test server's port + extract_dst_ipaddrport_from_ops(skops, &key); + key0.ip4 = 0; + key0.port = key.port; + + if (!is_netacc_sock(&key, &key0)) + return 0; + net_dbg("this is netacc sock\n"); + + net_dbg("the sock is netacc loopback sock\n"); + return 1; +} + +static inline int netacc_strncmp(const char *cs, const char *ct, size_t count) +{ + unsigned char c1, c2; + + while (count) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? 
-1 : 1; + if (!c1) + break; + count--; + } + return 0; +} + +static inline int update_netacc_info(struct bpf_sock_ops *skops) +{ + struct ipaddr_port key; + int value = 1; + char comm[16] = {0}; + + bpf_get_current_comm(comm, sizeof(comm)); + + if (netacc_strncmp(comm, "redis-server", 12)) + return 0; + + key.ip4 = skops->local_ip4; + key.port = skops->local_port; // host order + + bpf_map_update_elem(&netacc_bind_map, &key, &value, BPF_NOEXIST); + net_dbg("%s, update netaccinfo: sip:0x%x, sport:%d\n", comm, key.ip4, key.port); + return 1; +} + +static inline void clean_netacc_info(struct bpf_sock_ops *skops) +{ + struct ipaddr_port key; + + key.ip4 = skops->local_ip4; + key.port = skops->local_port; // host order + net_dbg("clean netaccinfo, 0x%x:%d\n", key.ip4, key.port); + bpf_map_delete_elem(&netacc_bind_map, &key); +} +#else +static inline int is_netacc_interested_tcp(struct bpf_sock_ops *skops) +{ + return 1; +} +static inline int update_netacc_info(struct bpf_sock_ops *skops) +{ + return 0; +} +static inline void clean_netacc_info(struct bpf_sock_ops *skops) +{} +#endif + +SEC("sockops") int netacc_sockops(struct bpf_sock_ops *skops) +{ + switch (skops->op) { + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + if (skops->family == 2 && skops->local_skb) {// AF_INET + if (is_netacc_interested_tcp(skops)) { + net_dbg("bpf_sockops, sockmap, op:%d, sk:%p\n", + skops->op, skops->sk); + bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG); + bpf_sockmap_ipv4_insert(skops); + } else { + bpf_sock_ops_cb_flags_set(skops, 0); + } + } + break; + case BPF_SOCK_OPS_STATE_CB: + if (skops->family == 2 && skops->args[0] == BPF_TCP_LISTEN && + skops->args[1] == BPF_TCP_CLOSE) { + clean_netacc_info(skops); + } else if (skops->family == 2 && (skops->args[1] == BPF_TCP_CLOSE || + skops->args[1] == BPF_TCP_CLOSE_WAIT || + skops->args[1] == BPF_TCP_FIN_WAIT1)) { + bpf_sockmap_ipv4_cleanup(skops, NULL); + } + break; + case 
BPF_SOCK_OPS_TCP_LISTEN_CB: + if (skops->family == 2 && update_netacc_info(skops)) + bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG); + break; + default: + break; + } + return 1; +} + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1;