From: Xu Kuohai xukuohai@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5M05G CVE: NA
-------------------------------------------------------
BMC is an in-kernel key-value cache implemented in BPF, proposed in paper [1]. The paper applies BMC to memcached and obtains a performance speedup of at least 6x.
This patch implements a sample BMC for Redis. Paper [1] implements BMC in XDP, bypassing the kernel network stack entirely. Because Redis runs over TCP, and it is almost impossible to fully process TCP traffic in XDP, this patch instead implements BMC in sockmap, which sits at the top of the kernel network stack. Since the kernel network stack is not bypassed, the speedup is not significant. In any case, this is only a sample implementation, and its performance can be improved incrementally.
See [2] for details on how to build samples/bpf.
Output files: samples/bpf/bmctool samples/bpf/bmc/bpf.o
Sample usage: bmctool prog load -p 6379 ./bmc/bpf.o # load bmc bpf prog and attach it to sockets with listen port 6379
bmctool stat # dump bmc status
bmctool prog unload # detach and unload bmc prog
[1] https://www.usenix.org/conference/nsdi21/presentation/ghigoff [2] https://www.kernel.org/doc/readme/samples-bpf-README.rst
Signed-off-by: Xu Kuohai xukuohai@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com --- samples/bpf/Makefile | 3 + samples/bpf/bmc/bpf.c | 144 ++++++++ samples/bpf/bmc/common.h | 21 ++ samples/bpf/bmc/redis.h | 648 ++++++++++++++++++++++++++++++++++ samples/bpf/bmc/tool.c | 733 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 1549 insertions(+) create mode 100644 samples/bpf/bmc/bpf.c create mode 100644 samples/bpf/bmc/common.h create mode 100644 samples/bpf/bmc/redis.h create mode 100644 samples/bpf/bmc/tool.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index aeebf5d12f32..f9bb6bdad6ce 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -54,6 +54,7 @@ tprogs-y += task_fd_query tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm +tprogs-y += bmctool
# Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -111,6 +112,7 @@ task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) +bmctool-objs := bmc/tool.o
# Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -172,6 +174,7 @@ always-y += ibumad_kern.o always-y += hbm_out_kern.o always-y += hbm_edt_kern.o always-y += xdpsock_kern.o +always-y += bmc/bpf.o
ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/bmc/bpf.c b/samples/bpf/bmc/bpf.c new file mode 100644 index 000000000000..127260c611f8 --- /dev/null +++ b/samples/bpf/bmc/bpf.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. + * + * Description: BPF program to accelerate Redis. The idea is to add a kernel + * cache for Redis data. When new Redis request is received, the kernel cache + * is checked, and if the requested data is found in the cache, a Redis reply + * message is constructed and sent back directly. + */ + +#include <uapi/linux/in.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/pkt_cls.h> + +#include <bpf/bpf_helpers.h> + +#define debug(fmt, ...) \ +do { \ + char ___fmt[] = fmt; \ + bpf_trace_printk(___fmt, sizeof(___fmt), ##__VA_ARGS__); \ +} while (0) + +struct tcp_key { + __u32 family; + __be32 local_ip4; + __be32 remote_ip4; + __be32 local_port; + __be32 remote_port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(key_size, sizeof(struct tcp_key)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 1024); +} bmc_socks SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 16); +} bmc_ports SEC(".maps"); + +SEC("bmc/sock_parser") +int sock_parser(struct __sk_buff *skb) +{ + return skb->len; +} + +static void init_tcp_key(struct tcp_key *key, struct bpf_sock *sk) +{ + if (sk != NULL) { + key->family = sk->family; + key->local_ip4 = sk->src_ip4; + key->remote_ip4 = sk->dst_ip4; + key->local_port = htonl(sk->src_port); + key->remote_port = htonl((u32)ntohs(sk->dst_port)); + } +} + +static int sock_redirect(struct __sk_buff *skb) +{ + struct tcp_key key; + struct bpf_sock *sk; + + sk = 
skb->sk; + if (sk == NULL) + return SK_PASS; + + init_tcp_key(&key, sk); + return bpf_sk_redirect_hash(skb, &bmc_socks, &key, 0); +} + +#include "redis.h" + +SEC("bmc/sock_verdict") +int sock_verdict(struct __sk_buff *skb) +{ + return bmc_process(skb); +} + +static bool is_bmc_port(u32 port) +{ + u32 *val = bpf_map_lookup_elem(&bmc_ports, &port); + + return val != NULL && *val != 0; +} + +static void add_bmc_sock(struct bpf_sock_ops *skops, struct bpf_sock *sk) +{ + struct tcp_key key; + + init_tcp_key(&key, sk); + bpf_sock_hash_update(skops, &bmc_socks, &key, BPF_ANY); +} + +static void delete_bmc_sock(struct bpf_sock_ops *skops, struct bpf_sock *sk) +{ + struct tcp_key key; + + init_tcp_key(&key, sk); + bpf_map_delete_elem(&bmc_socks, &key); +} + +SEC("bmc/sock_ops") +int sock_ops(struct bpf_sock_ops *skops) +{ + int op; + u16 local_port; + struct tcp_key key; + struct bpf_sock *sk; + + sk = skops->sk; + if (skops->family != AF_INET || sk == NULL) + return 0; + + local_port = ntohs((u16)sk->src_port); + + switch ((int)skops->op) { + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + if (is_bmc_port(local_port)) { + bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG); + add_bmc_sock(skops, sk); + } + break; + + case BPF_SOCK_OPS_STATE_CB: + if ((int)skops->args[1] == BPF_TCP_CLOSE) + delete_bmc_sock(skops, sk); + break; + + default: + break; + } + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/bmc/common.h b/samples/bpf/bmc/common.h new file mode 100644 index 000000000000..51c8623ab4f8 --- /dev/null +++ b/samples/bpf/bmc/common.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. 
+ * Description: common header for both user prog and bpf kernel prog + */ +#ifndef __REDIS_BMC_COMMON_H__ +#define __REDIS_BMC_COMMON_H__ + +#define REDIS_GET_PROG_INDEX 0 +#define REDIS_SET_PROG_INDEX 1 + +struct redis_bmc_stat { + __u64 total_get_requests; + __u64 hit_get_requests; + __u64 drop_get_requests; + __u64 total_set_requests; + __u64 hit_set_requests; + __u64 drop_set_requests; +}; + +#endif diff --git a/samples/bpf/bmc/redis.h b/samples/bpf/bmc/redis.h new file mode 100644 index 000000000000..6e739ce3d81a --- /dev/null +++ b/samples/bpf/bmc/redis.h @@ -0,0 +1,648 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. + * + * Description: This file parses REDIS commands. When SET command is received, + * the KEY and VALUE fields are extracted from the message and are stored to + * bmc_storage. When GET command is received, we lookup bmc_storage with the + * KEY received, and if success we fill the reply message with the found VALUE + * and send it back to the client. + * + * Here is a sample redis SET and GET session: + * (C: is the client, S: the server) + * + * C: "*3\r\n$3\r\nset\r\n$5\r\nkey01\r\n$5\r\nval01\r\n" + * S: "+OK\r\n" + * C: "*2\r\n$3\r\nget\r\n$5\r\nkey01\r\n" + * S: "$5\r\nval01\r\n" + * + * See [0] for RESP protocol details. 
+ * [0] https://redis.io/docs/reference/protocol-spec/ + */ + +#include "common.h" + +#define BMC_MAX_REDIS_KEY_LEN 448 /* total key size should be less than 512 */ +#define BMC_MAX_REDIS_VALUE_LEN 2048 +#define BMC_MAX_CPUS 512 // NR_CPUS + +struct redis_key { + u32 len; + u8 data[BMC_MAX_REDIS_KEY_LEN]; +}; + +struct redis_value { + u32 len; + u8 data[BMC_MAX_REDIS_VALUE_LEN]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(key_size, sizeof(struct redis_key)); + __uint(value_size, sizeof(struct redis_value)); + __uint(max_entries, 10000); +} bmc_storage SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 2); +} bmc_jump_table SEC(".maps"); + +struct redis_ctx { + struct redis_key key; + struct redis_value value; + u32 offset; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(struct redis_ctx)); + __uint(max_entries, BMC_MAX_CPUS); +} ctxmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(struct redis_bmc_stat)); + __uint(max_entries, BMC_MAX_CPUS); +} bmc_stats SEC(".maps"); + +static int bmc_copy_from_skb(void *dst, void *dend, + struct __sk_buff *skb, + u32 skb_off, u32 len) +{ + u32 i; + u32 off = 0; + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + + if (len > 2047) + return -1; + + if (len >= 1024 && dst + off + 1024 < dend && + data + skb_off + off + 1024 < data_end) { + if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 1024)) + return -1; + off += 1024; + len -= 1024; + } + if (len >= 512 && dst + off + 512 < dend && + data + skb_off + off + 512 < data_end) { + if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 512)) + return -1; + off += 512; + len -= 512; + } + if (len >= 256 && dst + off + 256 < dend && + data + skb_off + off + 256 < data_end) { + if 
(bpf_skb_load_bytes(skb, skb_off + off, dst + off, 256)) + return -1; + off += 256; + len -= 256; + } + if (len >= 128 && dst + off + 128 < dend && + data + skb_off + off + 128 < data_end) { + if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 128)) + return -1; + off += 128; + len -= 128; + } + if (len >= 64 && dst + off + 64 < dend && + data + skb_off + off + 64 < data_end) { + if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 64)) + return -1; + off += 64; + len -= 64; + } + if (len >= 32 && dst + off + 32 < dend && + data + skb_off + off + 32 < data_end) { + if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 32)) + return -1; + off += 32; + len -= 32; + } + if (len >= 16 && dst + off + 16 < dend && + data + skb_off + off + 16 < data_end) { + if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 16)) + return -1; + off += 16; + len -= 16; + } + + if (len >= 8 && dst + off + 8 < dend && + data + skb_off + off + 8 < data_end) { + if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 8)) + return -1; + off += 8; + len -= 8; + } + + if (len >= 4 && dst + off + 4 < dend && + data + skb_off + off + 4 < data_end) { + if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 4)) + return -1; + off += 4; + len -= 4; + } + + if (len >= 2 && dst + off + 2 < dend && + data + skb_off + off + 2 < data_end) { + if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 2)) + return -1; + off += 2; + len -= 2; + } + + if (len >= 1 && dst + off + 1 < dend && + data + skb_off + off + 1 < data_end) { + if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 1)) + return -1; + off += 1; + len -= 1; + } + + return len == 0 ? 
0 : -1; +} + +static int bmc_copy_to_skb(struct __sk_buff *skb, u32 skb_off, + void *dst, void *dend, u32 len) +{ + u32 i; + u32 off = 0; + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + + if (len > 2047) + return -1; + + if (len >= 1024 && dst + off + 1024 < dend && + data + skb_off + off + 1024 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 1024, 0)) + return -1; + off += 1024; + len -= 1024; + } + if (len >= 512 && dst + off + 512 < dend && + data + skb_off + off + 512 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 512, 0)) + return -1; + off += 512; + len -= 512; + } + if (len >= 256 && dst + off + 256 < dend && + data + skb_off + off + 256 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 256, 0)) + return -1; + off += 256; + len -= 256; + } + if (len >= 128 && dst + off + 128 < dend && + data + skb_off + off + 128 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 128, 0)) + return -1; + off += 128; + len -= 128; + } + if (len >= 64 && dst + off + 64 < dend && + data + skb_off + off + 64 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 64, 0)) + return -1; + off += 64; + len -= 64; + } + if (len >= 32 && dst + off + 32 < dend && + data + skb_off + off + 32 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 32, 0)) + return -1; + off += 32; + len -= 32; + } + if (len >= 16 && dst + off + 16 < dend && + data + skb_off + off + 16 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 16, 0)) + return -1; + off += 16; + len -= 16; + } + + if (len >= 8 && dst + off + 8 < dend && + data + skb_off + off + 8 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 8, 0)) + return -1; + off += 8; + len -= 8; + } + + if (len >= 4 && dst + off + 4 < dend && + data + skb_off + off + 4 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 4, 0)) + return 
-1; + off += 4; + len -= 4; + } + + if (len >= 2 && dst + off + 2 < dend && + data + skb_off + off + 2 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 2, 0)) + return -1; + off += 2; + len -= 2; + } + + if (len >= 1 && dst + off + 1 < dend && + data + skb_off + off + 1 < data_end) { + if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 1, 0)) + return -1; + off += 1; + len -= 1; + } + + return len == 0 ? 0 : -1; +} + +static inline struct redis_ctx *get_ctx(void) +{ + u32 cpu = bpf_get_smp_processor_id(); + + if (cpu >= BMC_MAX_CPUS) + return NULL; + return bpf_map_lookup_elem(&ctxmap, &cpu); +} + +static inline struct redis_bmc_stat *get_stat(void) +{ + u32 cpu = bpf_get_smp_processor_id(); + + if (cpu >= BMC_MAX_CPUS) + return NULL; + return bpf_map_lookup_elem(&bmc_stats, &cpu); +} + +static int do_redis_get_handler(struct __sk_buff *skb, struct redis_ctx *ctx) +{ + int i; + u32 n; + int err; + char *p; + char *data; + char *data_end; + char buf[5]; + struct redis_value *val; + + ctx = get_ctx(); + if (!ctx) + return BPF_OK; + + val = bpf_map_lookup_elem(&bmc_storage, &ctx->key); + if (val == NULL || val->len == 0 || val->len > sizeof(val->data)) + return BPF_OK; + + n = val->len; + + i = 0; + while (i < 5) { + buf[i] = '0' + n % 10; + n = n / 10; + i++; + if (n == 0) + break; + } + + if (i >= 5) + return BPF_OK; + + /* $ LEN \r \n VALUE \r \n */ + n = 1 + i + 2 + val->len + 2; + + if (n > skb->len) + /* extend head space */ + err = bpf_skb_change_head(skb, n - skb->len, 0); + else if (n < skb->len) + /* shrink head space */ + err = bpf_skb_adjust_room(skb, -(skb->len - n), 0, 0); + + if (err) + return BPF_DROP; + + data = (char *)(long)skb->data; + data_end = (char *)(long)skb->data_end; + p = data; + /* 3 is '$' and "\r\n"*/ + if (p + i + 3 > data_end) + return BPF_DROP; + + *p++ = '$'; + while (p < data_end && --i >= 0) + *p++ = buf[i]; + *p++ = '\r'; + *p++ = '\n'; + + n = val->len; + if (n == 0 || n > sizeof(val->data) || p + n + 2 
> data_end) + return BPF_DROP; + + if (bmc_copy_to_skb(skb, p - data, val->data, + val->data + sizeof(val->data), n)) + return BPF_DROP; + + p += n; + char end_mark[] = { '\r', '\n'}; + + bpf_skb_store_bytes(skb, p - data, end_mark, sizeof(end_mark), 0); + + return BPF_REDIRECT; +} + +static int do_redis_set_handler(struct __sk_buff *skb, struct redis_ctx *ctx) +{ + int err; + u32 off = 0; + u32 value_len; + char *data = (char *)(long)skb->data; + char *data_end = (char *)(long)skb->data_end; + + if (data + 1 > data_end || data[0] != '$') + return BPF_OK; + off++; + data++; + + value_len = 0; + if (data < data_end && data[0] >= '0' && data[0] <= '9') { + value_len = value_len * 10 + data[0] - '0'; + off++; + data++; + } + if (data < data_end && data[0] >= '0' && data[0] <= '9') { + value_len = value_len * 10 + data[0] - '0'; + off++; + data++; + } + if (data < data_end && data[0] >= '0' && data[0] <= '9') { + value_len = value_len * 10 + data[0] - '0'; + off++; + data++; + } + if (data < data_end && data[0] >= '0' && data[0] <= '9') { + value_len = value_len * 10 + data[0] - '0'; + off++; + data++; + } + + if (data + 2 > data_end || data[0] != '\r' || data[1] != '\n') + return BPF_OK; + off += 2; + data += 2; + + if (data > data_end) + return BPF_OK; + + /* format error */ + if (value_len <= 0 || value_len > sizeof(ctx->value.data) || + value_len >= data_end - data) { + return BPF_OK; + } + + if (bmc_copy_from_skb(ctx->value.data, + ctx->value.data + sizeof(ctx->value.data), + skb, off, value_len)) + return BPF_OK; + + ctx->value.len = value_len; + + if (bpf_map_update_elem(&bmc_storage, &ctx->key, &ctx->value, BPF_ANY)) { + bpf_map_delete_elem(&bmc_storage, &ctx->key); + return BPF_OK; + } + + char reply[] = { '+', 'O', 'K', '\r', '\n'}; + + if (skb->len < sizeof(reply)) + /* extend head space */ + err = bpf_skb_change_head(skb, sizeof(reply) - skb->len, 0); + else + /* shrink head space */ + err = bpf_skb_adjust_room(skb, -(skb->len - sizeof(reply)), 0, 0); + + 
if (err) + return BPF_OK; + + bpf_skb_store_bytes(skb, 0, reply, sizeof(reply), 0); + + return BPF_REDIRECT; +} + +SEC("bmc/redis_get_handler") +int redis_get_handler(struct __sk_buff *skb) +{ + int err; + struct redis_bmc_stat *stat; + struct redis_ctx *ctx; + + stat = get_stat(); + if (!stat) + return SK_PASS; + + stat->total_get_requests++; + + ctx = get_ctx(); + if (!ctx) + return SK_PASS; + + err = do_redis_get_handler(skb, ctx); + if (err == BPF_REDIRECT) { + stat->hit_get_requests++; + return sock_redirect(skb); + } + + if (err == BPF_DROP) { + stat->drop_get_requests++; + return SK_DROP; + } + + return SK_PASS; +} + +SEC("bmc/redis_set_handler") +int redis_set_handler(struct __sk_buff *skb) +{ + int err; + struct redis_bmc_stat *stat; + struct redis_ctx *ctx; + + stat = get_stat(); + if (!stat) + return SK_PASS; + + stat->total_set_requests++; + + ctx = get_ctx(); + if (!ctx) + return SK_PASS; + + err = do_redis_set_handler(skb, ctx); + if (err == BPF_REDIRECT) { + stat->hit_set_requests++; + return sock_redirect(skb); + } + + if (err == BPF_DROP) { + stat->drop_set_requests++; + return SK_DROP; + } + + err = bpf_skb_adjust_room(skb, ctx->offset, 0, 0); + if (!err) + return SK_PASS; + + stat->drop_set_requests++; + return SK_DROP; +} + +static inline int bmc_process(struct __sk_buff *skb) +{ + u32 off; + int err; + u32 key_len; + char *data; + char *data_end; + int expect_get = 0; + int is_get = 0; + struct redis_ctx *ctx; + + ctx = get_ctx(); + if (ctx == NULL) + return SK_PASS; + + err = bpf_skb_pull_data(skb, skb->len); + if (err) + return SK_PASS; + + off = 0; + data = (char *)(long)skb->data; + data_end = (char *)(long)skb->data_end; + + /* + * SET message format: + * "*3\r\n" // this is an array with 3 elements + * "$3\r\n" // the first element is a string with 3 characters + * "set\r\n" // the string is "set" + * "$5\r\n" // the second element is a string with 5 characters + * "key01\r\n" // the string is "key01" + * "$5\r\n" // the third element is 
a string with 5 characters + * "val01\r\n" // the string is "valu01" + * + * GET message format: + * "*2\r\n" // this is an array with 3 elements + * "$3\r\n" // the first element is a string with 3 characters + * "get\r\n" // the string is "get" + * "$5\r\n" // the second element is a string with 5 characters + * "key01\r\n" // the string is "key01" + */ + if (data + 4 > data_end) + return SK_PASS; + + /* Not GET, Not SET */ + if (data[0] != '*' || (data[1] != '2' && data[1] != '3') || + data[2] != '\r' || data[3] != '\n') + return SK_PASS; + + expect_get = (data[1] == '2'); + off += 4; + data += 4; + + if (data + 4 > data_end) + return SK_PASS; + + if (data[0] != '$' || data[1] != '3' || data[2] != '\r' || + data[3] != '\n') + return SK_PASS; + + off += 4; + data += 4; + + if (data + 5 > data_end) + return SK_PASS; + + switch (data[0]) { + case 'g': + is_get = 1; + case 's': + if (data[1] != 'e' || data[2] != 't' || + data[3] != '\r' || data[4] != '\n') + return SK_PASS; + break; + case 'G': + is_get = 1; + case 'S': + if (data[1] != 'E' || data[2] != 'T' || + data[3] != '\r' || data[4] != '\n') + return SK_PASS; + break; + default: + return SK_PASS; + } + off += 5; + data += 5; + + if (expect_get != is_get) + return SK_PASS; + + if (data + 1 > data_end || data[0] != '$') + return SK_PASS; + off++; + data++; + + key_len = 0; + if (data < data_end && data[0] >= '0' && data[0] <= '9') { + key_len = key_len * 10 + data[0] - '0'; + off++; + data++; + } + if (data < data_end && data[0] >= '0' && data[0] <= '9') { + key_len = key_len * 10 + data[0] - '0'; + off++; + data++; + } + if (data < data_end && data[0] >= '0' && data[0] <= '9') { + key_len = key_len * 10 + data[0] - '0'; + off++; + data++; + } + if (data < data_end && data[0] >= '0' && data[0] <= '9') { + key_len = key_len * 10 + data[0] - '0'; + off++; + data++; + } + if (data + 2 > data_end || data[0] != '\r' || data[1] != '\n') + return SK_PASS; + off += 2; + data += 2; + + if (data > data_end) + return 
SK_PASS; + + if (key_len == 0 || key_len > sizeof(ctx->key.data) || + key_len >= data_end - data) + return SK_PASS; + + ctx->offset = off + key_len + 2; + ctx->key.len = key_len; + + if (bmc_copy_from_skb(ctx->key.data, + ctx->key.data + sizeof(ctx->key.data), + skb, off, key_len)) + return SK_PASS; + + if (is_get) { + bpf_tail_call(skb, &bmc_jump_table, REDIS_GET_PROG_INDEX); + } else { + err = bpf_skb_adjust_room(skb, -ctx->offset, 0, 0); + if (err) + return SK_PASS; + bpf_tail_call(skb, &bmc_jump_table, REDIS_SET_PROG_INDEX); + } + return SK_PASS; +} diff --git a/samples/bpf/bmc/tool.c b/samples/bpf/bmc/tool.c new file mode 100644 index 000000000000..e45be64a2819 --- /dev/null +++ b/samples/bpf/bmc/tool.c @@ -0,0 +1,733 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include <errno.h> + +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/select.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <fcntl.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "common.h" + +#define DEFAULT_CGROUP_PATH "/sys/fs/cgroup" +#define DEFAULT_REDIS_PORT 6379 + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) +#endif + +struct { + char *cgroup_path; + char *bpf_path; + int cgroup_fd; + int map_socks_fd; + int map_ports_fd; + int map_storage_fd; + int map_jump_table_fd; + int map_stats_fd; + int sock_parser_prog_fd; + int sock_verdict_prog_fd; + int sock_ops_prog_fd; + int redis_get_prog_fd; + int redis_set_prog_fd; + uint16_t listen_port; +} bmc; + +struct bmc_prog_info { + const char *sec_name; + enum bpf_prog_type prog_type; + enum bpf_attach_type attach_type; + int *p_prog_fd; + int *p_attach_fd; + unsigned int attach_flags; + const char *pin_path; + struct bpf_program *prog; +}; + +struct 
bmc_map_info { + const char *map_name; + int *p_map_fd; + char *pin_path; + struct bpf_map *map; + bool is_stat_map; +}; + +static struct bmc_prog_info prog_infos[] = { + { + .sec_name = "bmc/sock_parser", + .prog_type = BPF_PROG_TYPE_SK_SKB, + .attach_type = BPF_SK_SKB_STREAM_PARSER, + .p_prog_fd = &bmc.sock_parser_prog_fd, + .p_attach_fd = &bmc.map_socks_fd, + .attach_flags = 0, + .pin_path = "/sys/fs/bpf/bmc/prog_sock_parser" + }, + { + .sec_name = "bmc/sock_verdict", + .prog_type = BPF_PROG_TYPE_SK_SKB, + .attach_type = BPF_SK_SKB_STREAM_VERDICT, + .p_prog_fd = &bmc.sock_verdict_prog_fd, + .p_attach_fd = &bmc.map_socks_fd, + .attach_flags = 0, + .pin_path = "/sys/fs/bpf/bmc/prog_sock_verdict" + }, + { + .sec_name = "bmc/sock_ops", + .prog_type = BPF_PROG_TYPE_SOCK_OPS, + .attach_type = BPF_CGROUP_SOCK_OPS, + .p_prog_fd = &bmc.sock_ops_prog_fd, + .p_attach_fd = &bmc.cgroup_fd, + .attach_flags = 0, + .pin_path = "/sys/fs/bpf/bmc/prog_sock_ops" + }, + { + .sec_name = "bmc/redis_get_handler", + .prog_type = BPF_PROG_TYPE_SK_SKB, + .p_prog_fd = &bmc.redis_get_prog_fd, + .p_attach_fd = NULL, + .attach_flags = 0, + .pin_path = "/sys/fs/bpf/bmc/prog_redis_get_handler" + + }, + { + .sec_name = "bmc/redis_set_handler", + .prog_type = BPF_PROG_TYPE_SK_SKB, + .p_prog_fd = &bmc.redis_set_prog_fd, + .p_attach_fd = NULL, + .attach_flags = 0, + .pin_path = "/sys/fs/bpf/bmc/prog_redis_set_handler" + + } +}; + +static struct bmc_map_info map_infos[] = { + { + .map_name = "bmc_socks", + .p_map_fd = &bmc.map_socks_fd, + .pin_path = "/sys/fs/bpf/bmc/map_socks" + }, + { + .map_name = "bmc_ports", + .p_map_fd = &bmc.map_ports_fd, + .pin_path = "/sys/fs/bpf/bmc/map_ports" + }, + { + .map_name = "bmc_storage", + .p_map_fd = &bmc.map_storage_fd, + .pin_path = "/sys/fs/bpf/bmc/map_storage" + }, + { + .map_name = "bmc_jump_table", + .p_map_fd = &bmc.map_jump_table_fd, + .pin_path = "/sys/fs/bpf/bmc/map_jump_table" + }, + { + .map_name = "bmc_stats", + .p_map_fd = &bmc.map_stats_fd, + 
.pin_path = "/sys/fs/bpf/bmc/stats", + .is_stat_map = true, + }, +}; + +static int find_type_by_sec_name(const char *sec_name, + enum bpf_prog_type *p_prog_type, + enum bpf_attach_type *p_attach_type) +{ + int i; + + if (sec_name == NULL) { + fprintf(stderr, "sec_name is NULL\n"); + return -1; + } + + for (i = 0; i < ARRAY_SIZE(prog_infos); i++) { + if (!strcmp(prog_infos[i].sec_name, sec_name)) { + *p_prog_type = prog_infos[i].prog_type; + *p_attach_type = prog_infos[i].attach_type; + return 0; + } + } + + fprintf(stderr, "unknown prog %s\n", sec_name); + + return -1; +} + +static int set_prog_type(struct bpf_object *obj) +{ + const char *sec_name; + struct bpf_program *prog; + enum bpf_prog_type prog_type; + enum bpf_attach_type attach_type; + + bpf_object__for_each_program(prog, obj) { + sec_name = bpf_program__section_name(prog); + if (find_type_by_sec_name(sec_name, &prog_type, &attach_type)) + return -1; + bpf_program__set_type(prog, prog_type); + bpf_program__set_expected_attach_type(prog, attach_type); + } + + return 0; +} + +static struct bpf_object *load_bpf_file(const char *bpf_file) +{ + int err; + char err_buf[256]; + struct bpf_object *obj; + + obj = bpf_object__open(bpf_file); + err = libbpf_get_error(obj); + if (err) { + libbpf_strerror(err, err_buf, sizeof(err_buf)); + fprintf(stderr, "unable to open bpf file %s : %s\n", bpf_file, + err_buf); + return NULL; + } + + if (set_prog_type(obj)) { + bpf_object__close(obj); + return NULL; + } + + err = bpf_object__load(obj); + if (err) { + fprintf(stderr, "load bpf object failed\n"); + bpf_object__close(obj); + return NULL; + } + + return obj; +} + +static int find_prog(struct bpf_object *obj, const char *sec_name, + struct bpf_program **p_prog, int *p_prog_fd) +{ + int fd; + struct bpf_program *prog; + + prog = bpf_object__find_program_by_title(obj, sec_name); + if (!prog) { + fprintf(stderr, "failed to find prog %s\n", sec_name); + return -1; + } + + fd = bpf_program__fd(prog); + if (fd < 0) { + 
fprintf(stderr, "failed to get fd of prog %s\n", sec_name); + return -1; + } + + + *p_prog = prog; + *p_prog_fd = fd; + + return 0; +} + +static void unpin_progs(int n) +{ + int i; + + for (i = 0; i < n; i++) + bpf_program__unpin(prog_infos[i].prog, prog_infos[i].pin_path); +} + +static int find_progs(struct bpf_object *obj) +{ + int i; + struct bmc_prog_info *info; + + for (i = 0; i < ARRAY_SIZE(prog_infos); i++) { + info = &prog_infos[i]; + + if (find_prog(obj, info->sec_name, &info->prog, info->p_prog_fd)) + goto error_find_prog; + + if (bpf_program__pin(info->prog, info->pin_path)) + goto error_find_prog; + } + + return 0; + +error_find_prog: + unpin_progs(i); + return -1; +} + +static int find_map(struct bpf_object *obj, const char *map_name, + struct bpf_map **p_map, int *p_map_fd) +{ + int fd; + struct bpf_map *map; + + map = bpf_object__find_map_by_name(obj, map_name); + if (!map) { + fprintf(stderr, "failed to find map %s\n", map_name); + return -1; + } + + fd = bpf_map__fd(map); + if (fd < 0) { + fprintf(stderr, "failed to get fd of map %s\n", map_name); + return -1; + } + + + *p_map = map; + *p_map_fd = fd; + + return 0; +} + +static void unpin_maps(int n) +{ + int i; + + for (i = 0; i < n; i++) + bpf_map__unpin(map_infos[i].map, map_infos[i].pin_path); +} + +static int find_maps(struct bpf_object *obj) +{ + int i; + struct bmc_map_info *info; + + for (i = 0; i < ARRAY_SIZE(map_infos); i++) { + info = &map_infos[i]; + + if (find_map(obj, info->map_name, &info->map, info->p_map_fd)) + goto error_find_map; + + if (bpf_map__pin(info->map, info->pin_path)) { + fprintf(stderr, "failed to pin map %s to path %s\n", + info->map_name, info->pin_path); + goto error_find_map; + } + } + + return 0; + +error_find_map: + unpin_maps(i); + return -1; +} + +static void detach_progs(int n) +{ + int i; + struct bmc_prog_info *info; + + for (i = 0; i < n; i++) { + info = &prog_infos[i]; + bpf_prog_detach(*info->p_prog_fd, info->attach_type); + } +} + +static int 
attach_progs(struct bpf_object *obj) +{ + int i; + int prog_fd; + int attach_fd; + unsigned int flags; + enum bpf_attach_type type; + struct bmc_prog_info *info; + + for (i = 0; i < ARRAY_SIZE(prog_infos); i++) { + info = &prog_infos[i]; + if (!info->p_attach_fd) + continue; + prog_fd = *info->p_prog_fd; + attach_fd = *info->p_attach_fd; + type = info->attach_type; + flags = info->attach_flags; + + if (bpf_prog_attach(prog_fd, attach_fd, type, flags)) { + fprintf(stderr, "attach prog %s failed!\n", + info->sec_name); + goto error_attach_prog; + } + } + + return 0; + +error_attach_prog: + detach_progs(i); + + return -1; +} + +static int add_bmc_port(void) +{ + int ret; + int map_fd = bmc.map_ports_fd; + uint16_t port = htons(bmc.listen_port); + uint32_t key = (uint32_t)port; + uint32_t value = 1; + + ret = bpf_map_update_elem(map_fd, &key, &value, 0); + if (ret) + fprintf(stderr, "failed to add port %u\n", port); + + return ret; +} + +static int add_tail_call(void) +{ + int ret; + int map_fd = bmc.map_jump_table_fd; + __u32 key; + __u32 value; + + key = REDIS_GET_PROG_INDEX; + value = bmc.redis_get_prog_fd; + ret = bpf_map_update_elem(map_fd, &key, &value, 0); + if (ret) { + fprintf(stderr, "failed to add redis get tail call prog\n"); + return -1; + } + + key = REDIS_SET_PROG_INDEX; + value = bmc.redis_set_prog_fd; + ret = bpf_map_update_elem(map_fd, &key, &value, 0); + if (ret) { + fprintf(stderr, "failed to add redis set tail call prog\n"); + key = REDIS_GET_PROG_INDEX; + bpf_map_delete_elem(map_fd, &key); + } + + return ret; +} + +static int setup_bpf(void) +{ + struct bpf_object *obj; + + bmc.cgroup_fd = open(bmc.cgroup_path, O_DIRECTORY, O_RDONLY); + if (bmc.cgroup_fd < 0) { + fprintf(stderr, "failed to open cgroup %s: %s\n", + bmc.cgroup_path, strerror(errno)); + return -1; + } + + obj = load_bpf_file(bmc.bpf_path); + if (!obj) + goto error_load_object; + + if (find_progs(obj)) + goto error_load_object; + + if (find_maps(obj)) + goto error_find_maps; + + if 
(attach_progs(obj)) + goto error_attach_progs; + + if (add_bmc_port()) + goto error_add_port; + + if (add_tail_call()) + goto error_attach_progs; + + return 0; + +error_add_port: + detach_progs(ARRAY_SIZE(prog_infos)); +error_attach_progs: + unpin_maps(ARRAY_SIZE(map_infos)); +error_find_maps: + unpin_progs(ARRAY_SIZE(prog_infos)); +error_load_object: + bpf_object__close(obj); + close(bmc.cgroup_fd); + return -1; +} + +static int parse_load_args(int argc, char *argv[]) +{ + int opt; + int port; + + bmc.cgroup_path = DEFAULT_CGROUP_PATH; + bmc.listen_port = DEFAULT_REDIS_PORT; + + while ((opt = getopt(argc, argv, "c:p:")) != -1) { + switch (opt) { + case 'c': + bmc.cgroup_path = optarg; + break; + case 'p': + port = atoi(optarg); + if (port <= 0 || port >= USHRT_MAX) { + fprintf(stderr, "invalid port: %s\n", optarg); + return -1; + } + bmc.listen_port = port; + break; + default: + fprintf(stderr, "unknown option %c\n", opt); + return -1; + } + } + + if (optind >= argc) { + fprintf(stderr, "no bpf prog file found\n"); + return -1; + } + + bmc.bpf_path = argv[optind]; + + printf("bpf file: %s\n", bmc.bpf_path); + printf("cgroup path: %s\n", bmc.cgroup_path); + printf("listen port: %d\n", bmc.listen_port); + + return 0; +} + +struct cmd { + const char *name; + int (*func)(int argc, char *argv[]); +}; + +static int do_prog(int argc, char *argv[]); +static int do_stat(int argc, char *argv[]); + +static int do_prog_load(int argc, char *argv[]); +static int do_prog_unload(int argc, char *argv[]); + +static struct cmd main_cmds[] = { + { "prog", do_prog }, + { "stat", do_stat }, +}; + +static struct cmd prog_cmds[] = { + { "load", do_prog_load }, + { "unload", do_prog_unload }, +}; + +static char *elf_name; + +static int dispatch_cmd(struct cmd cmds[], int ncmd, int argc, + char *argv[], void (*help)(void)) +{ + int i; + int ret; + + if (argc <= 0) { + help(); + return -1; + } + + for (i = 0; i < ncmd; i++) { + if (!strcmp(argv[0], cmds[i].name)) { + ret = cmds[i].func(argc 
- 1, argv + 1); + if (ret == -2) { + help(); + ret = -1; + } + return ret; + } + } + + help(); + + return -1; +} + +static int do_prog_load(int argc, char *argv[]) +{ + if (parse_load_args(argc + 1, argv - 1) < 0) + return -2; + + if (setup_bpf()) + return -1; + + return 0; +} + +static int do_prog_unload(int argc, char *argv[]) +{ + int i; + int prog_fd; + int cgroup_fd; + char *cgroup_path = DEFAULT_CGROUP_PATH; + + if (argc > 1) + cgroup_path = argv[0]; + + cgroup_fd = open(cgroup_path, O_DIRECTORY, O_RDONLY); + if (cgroup_fd < 0) { + fprintf(stderr, "failed to open cgroup path: %s\n", + cgroup_path); + return -1; + } + + for (i = 0; i < ARRAY_SIZE(prog_infos); i++) { + if (prog_infos[i].attach_type == BPF_CGROUP_SOCK_OPS) { + prog_fd = bpf_obj_get(prog_infos[i].pin_path); + if (prog_fd >= 0) + bpf_prog_detach2(prog_fd, cgroup_fd, + BPF_CGROUP_SOCK_OPS); + } + unlink(prog_infos[i].pin_path); + } + + for (i = 0; i < ARRAY_SIZE(map_infos); i++) + unlink(map_infos[i].pin_path); + + return 0; +} + +static void do_prog_help(void) +{ + fprintf(stderr, + "Usage: %s prog load [-c CGROUP_PATH] [-p LISTEN_PORT] {BPF_FILE}\n" + " %s prog unload [CGROUP_PATH]\n", + elf_name, elf_name); +} + +static int do_prog(int argc, char *argv[]) +{ + return dispatch_cmd(prog_cmds, ARRAY_SIZE(prog_cmds), + argc, argv, do_prog_help); +} + +static int do_stat(int argc, char *argv[]) +{ + int i; + int fd; + int err; + int ncpu; + bool found = false; + struct bmc_map_info *info; + struct bpf_map_info map = {}; + struct redis_bmc_stat stat = {}; + __u32 len = sizeof(map); + + ncpu = sysconf(_SC_NPROCESSORS_ONLN); + if (ncpu < 0) { + fprintf(stderr, "sysconf failed: %s\n", strerror(errno)); + return -1; + } + + for (i = 0; i < ARRAY_SIZE(map_infos); i++) { + info = &map_infos[i]; + if (info->is_stat_map) { + found = true; + break; + } + } + + if (!found) { + fprintf(stderr, "no stats map found\n"); + return -1; + } + + fd = bpf_obj_get(info->pin_path); + if (fd < 0) { + fprintf(stderr, 
"failed to open %s\n", + info->pin_path); + return -1; + } + + err = bpf_obj_get_info_by_fd(fd, &map, &len); + if (err) { + fprintf(stderr, "failed to get map info\n"); + goto error; + } + + if (map.type != BPF_MAP_TYPE_ARRAY) { + fprintf(stderr, "unexpected map type: %d\n", map.type); + goto error; + } + + if (map.key_size != sizeof(__u32)) { + fprintf(stderr, "unexpected map key_size: %u\n", map.key_size); + goto error; + } + + if (map.value_size != sizeof(struct redis_bmc_stat)) { + fprintf(stderr, "unexpected map key_size: %u\n", map.key_size); + goto error; + } + + for (int i = 0; i < ncpu; i++) { + __u32 key = i; + struct redis_bmc_stat value; + + err = bpf_map_lookup_elem(fd, &key, &value); + if (err) { + fprintf(stderr, "lookup cpu stat failed, cpu=%u\n", i); + goto error; + } + stat.total_get_requests += value.total_get_requests; + stat.hit_get_requests += value.hit_get_requests; + stat.drop_get_requests += value.drop_get_requests; + stat.total_set_requests += value.total_set_requests; + stat.hit_set_requests += value.hit_set_requests; + stat.drop_set_requests += value.drop_set_requests; + } + + printf("Total GET Requests: %llu\n", stat.total_get_requests); + printf("Hit GET Requests: %llu (%.2f%%)\n", stat.hit_get_requests, + stat.total_get_requests == 0 ? 0 : + (double)stat.hit_get_requests / + (double)stat.total_get_requests * + 100); + printf("Dropped GET Requests: %llu (%.2lf%%)\n", stat.drop_get_requests, + stat.total_get_requests == 0 ? 0 : + (double)stat.drop_get_requests / + (double)stat.total_get_requests * + 100); + + printf("Total SET Requests: %llu\n", stat.total_set_requests); + printf("Hit SET Requests: %llu (%.2f%%)\n", stat.hit_set_requests, + stat.total_set_requests == 0 ? 0 : + (double)stat.hit_set_requests / + (double)stat.total_set_requests * + 100); + printf("Dropped SET Requests: %llu (%.2lf%%)\n", stat.drop_set_requests, + stat.total_set_requests == 0 ? 
0 : + (double)stat.drop_set_requests / + (double)stat.total_set_requests * + 100); + + close(fd); + + return 0; + +error: + close(fd); + return -1; +} + +static void do_main_help(void) +{ + fprintf(stderr, + "Usage: %s OBJECT { COMMAND | help }\n" + " OBJECT := { prog | stat }\n", + elf_name); +} + +int main(int argc, char *argv[]) +{ + elf_name = argv[0]; + + return dispatch_cmd(main_cmds, ARRAY_SIZE(main_cmds), + argc - 1, argv + 1, do_main_help); +}