From: Xu Kuohai <xukuohai@huawei.com>
hulk inclusion
category: feature
bugzilla: N/A
--------------------------------
BMC is an in-kernel key-value cache implemented in BPF, proposed in paper [1]. The paper evaluates BMC for memcached and reports at least a 6x performance speedup.
This patch implements a sample BMC for Redis.
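To make the data path easier to follow before reading the full diff, here is a
stripped-down sketch of what bpf.c implements: an XDP program parses the Redis
GET request, looks the RESP-encoded key up in an LRU hash map, and on a hit
rewrites the packet into the reply and transmits it back with XDP_TX. SET
caching, checksums and TCP sequence bookkeeping are handled by the real program
and only hinted at in comments here.

/*
 * Stripped-down sketch of the fast path in samples/bpf/bmc/bpf.c below.
 * Header/RESP parsing, SET handling, seq/ack fixup and checksums are
 * only indicated by comments; see bpf.c for the real thing.
 */
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define BMC_MAX_REDIS_KEY_LEN	64
#define BMC_MAX_REDIS_VALUE_LEN	128

struct redis_key {
	__u32 len;
	__u8 data[BMC_MAX_REDIS_KEY_LEN + 16];	/* RESP-encoded key */
};

struct redis_value {
	__u32 len;
	__u8 data[BMC_MAX_REDIS_VALUE_LEN + 16];	/* RESP-encoded value */
};

struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__uint(key_size, sizeof(struct redis_key));
	__uint(value_size, sizeof(struct redis_value));
	__uint(max_entries, 10000);
} bmc_storage SEC(".maps");

SEC("bmc/main")
int bmc_main(struct xdp_md *xdp)
{
	struct redis_key key = {};
	struct redis_value *val;

	/*
	 * 1. Parse Ethernet/IPv4/TCP headers and check the destination port
	 *    against the bmc_ports map (omitted).
	 * 2. Parse the RESP request, e.g. "*2\r\n$3\r\nget\r\n$5\r\nkey01\r\n",
	 *    and copy the encoded key into 'key' (omitted).
	 */

	val = bpf_map_lookup_elem(&bmc_storage, &key);
	if (!val || !val->len)
		return XDP_PASS;	/* miss: let user-space Redis answer */

	/*
	 * 3. Hit: swap MAC/IP addresses and ports, fix up seq/ack, resize the
	 *    frame with bpf_xdp_adjust_tail(), write the cached value as the
	 *    reply payload and recompute checksums.
	 */
	return XDP_TX;			/* bounce the reply out the same NIC */
}

char _license[] SEC("license") = "GPL";

On a miss, or for anything the parser does not recognize, the packet is passed
to the stack with XDP_PASS, so user-space Redis still sees every request it has
to serve.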
See [2] for details on how to build samples/bpf.
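As a quick reference (see [2] for the full prerequisites, e.g. clang/LLVM), the
samples are built from the kernel top-level directory; the exact target depends
on the kernel version:

  make M=samples/bpf      # newer trees
  make samples/bpf/       # older trees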
Output files:
  samples/bpf/bmctool
  samples/bpf/bmc/bpf.o
Sample usage:
  bmctool prog load -i <ifname> -p 6379 ./bmc/bpf.o  # load the bmc bpf prog, attach it to
                                                     # the given interface and serve
                                                     # requests to listen port 6379
  bmctool stat                                       # dump bmc status
  bmctool prog unload                                # detach and unload bmc prog
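bmctool stat works by reading the per-CPU stats map that prog load pins at
/sys/fs/bpf/bmc/stats (the pin path and struct redis_bmc_stat come from tool.c
and common.h below). A minimal sketch of the same lookup, in case the counters
need to be consumed from another program:

/*
 * Sketch only: read the pinned BMC per-CPU stats map and sum the GET
 * counters. bmctool stat in this patch does the same, with more checks.
 */
#include <stdio.h>
#include <stdlib.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

#include "common.h"	/* struct redis_bmc_stat */

int main(void)
{
	struct redis_bmc_stat *percpu, sum = {};
	int ncpu = libbpf_num_possible_cpus();
	int fd = bpf_obj_get("/sys/fs/bpf/bmc/stats");
	__u32 key = 0;

	if (fd < 0 || ncpu <= 0)
		return 1;

	/* One lookup on a BPF_MAP_TYPE_PERCPU_ARRAY fills one entry per CPU. */
	percpu = calloc(ncpu, sizeof(*percpu));
	if (!percpu || bpf_map_lookup_elem(fd, &key, percpu))
		return 1;

	for (int i = 0; i < ncpu; i++) {
		sum.total_get_requests += percpu[i].total_get_requests;
		sum.hit_get_requests   += percpu[i].hit_get_requests;
	}

	printf("GET: total=%llu hit=%llu\n",
	       sum.total_get_requests, sum.hit_get_requests);

	free(percpu);
	return 0;
}

Since the map is a per-CPU array with a single slot, the counters have to be
summed across CPUs; bmctool stat prints those totals together with hit and drop
percentages.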
Tested with the following command:
./redis-benchmark -c 20 -r 1 -n 1000 -t get -h 192.168.4.101 -d 102

(-r 1 restricts the benchmark to a single key and -d 102 keeps the value within
BMC's 128-byte value limit, so GET requests can be answered from the in-kernel
cache.)
Without BMC:
  throughput summary: 41666.67 requests per second
  latency summary (msec):
        avg       min       p50       p95       p99       max
      0.441     0.176     0.415     0.631     1.455     1.815

With BMC (100% HIT):
  throughput summary: 66666.67 requests per second
  latency summary (msec):
        avg       min       p50       p95       p99       max
      0.223     0.096     0.215     0.311     0.743     0.759

BMC Stat:
  Total GET Requests: 1000
  Hit GET Requests: 1000 (100.00%)
  Dropped GET Requests: 0 (0.00%)
  Total SET Requests: 1
  Hit SET Requests: 1 (100.00%)
  Dropped SET Requests: 0 (0.00%)
[1] https://www.usenix.org/conference/nsdi21/presentation/ghigoff
[2] https://www.kernel.org/doc/readme/samples-bpf-README.rst
Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Yang Jihong <yangjihong@huawei.com>
Signed-off-by: He Fengqing <hefengqing@huawei.com> (original demo)
---
 samples/bpf/Makefile     |   3 +
 samples/bpf/bmc/bpf.c    | 485 +++++++++++++++++++++++++
 samples/bpf/bmc/common.h |  21 ++
 samples/bpf/bmc/tool.c   | 763 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 1272 insertions(+)
 create mode 100644 samples/bpf/bmc/bpf.c
 create mode 100644 samples/bpf/bmc/common.h
 create mode 100644 samples/bpf/bmc/tool.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index aeebf5d12f32..639767d587a1 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -54,6 +54,7 @@ tprogs-y += task_fd_query
 tprogs-y += xdp_sample_pkts
 tprogs-y += ibumad
 tprogs-y += hbm
+tprogs-y += bmctool
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -111,6 +112,7 @@ task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
 ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
 hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
+bmctool-objs := bmc/tool.o
 
 # Tell kbuild to always build the programs
 always-y := $(tprogs-y)
@@ -172,6 +174,7 @@ always-y += ibumad_kern.o
 always-y += hbm_out_kern.o
 always-y += hbm_edt_kern.o
 always-y += xdpsock_kern.o
+always-y += bmc/bpf.o
ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/bmc/bpf.c b/samples/bpf/bmc/bpf.c new file mode 100644 index 000000000000..154bc5665446 --- /dev/null +++ b/samples/bpf/bmc/bpf.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. + * + * Description: BPF program to accelerate Redis. The idea is to add a kernel + * cache for Redis data. When new Redis request is received, the kernel cache + * is checked, and if the requested data is found in the cache, a Redis reply + * message is constructed and sent back directly. + */ + +#include <uapi/linux/in.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/pkt_cls.h> + +#include <bpf/bpf_endian.h> +#include <bpf/bpf_helpers.h> + +#include "common.h" + +#define BMC_MAX_REDIS_KEY_LEN 64 +#define BMC_MAX_REDIS_VALUE_LEN 128 + +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 16); +} bmc_ports SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 1); +} bmc_interface SEC(".maps"); + +struct redis_key { + u32 len; + /* encoded in redis format */ + u8 data[BMC_MAX_REDIS_KEY_LEN + 16]; +}; + +struct redis_value { + u32 len; + /* encoded in redis format */ + u8 data[BMC_MAX_REDIS_VALUE_LEN + 16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(key_size, sizeof(struct redis_key)); + __uint(value_size, sizeof(struct redis_value)); + __uint(max_entries, 10000); +} bmc_storage SEC(".maps"); + +struct redis_ctx { + struct redis_key key; + struct redis_value value; + u32 offset; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(struct redis_ctx)); + __uint(max_entries, 1); +} ctxmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(struct redis_bmc_stat)); + __uint(max_entries, 1); +} bmc_stats SEC(".maps"); + +static inline struct redis_ctx *get_ctx(void) +{ + u32 key = 0; + + return bpf_map_lookup_elem(&ctxmap, &key); +} + +static inline struct redis_bmc_stat *get_stat(void) +{ + u32 key = 0; + + return bpf_map_lookup_elem(&bmc_stats, &key); +} + +static bool is_bmc_port(u32 port) +{ + u32 *val = bpf_map_lookup_elem(&bmc_ports, &port); + + return val != NULL && *val != 0; +} + +static inline void compute_ip_checksum(struct iphdr *ip) +{ + u32 csum = 0; + u16 *next_ip_u16 = (u16 *)ip; + + ip->check = 0; + +#pragma clang loop unroll(full) + for (int i = 0; i < (sizeof(*ip) >> 1); i++) + csum += *next_ip_u16++; + + ip->check = ~((csum & 0xffff) + (csum >> 16)); +} + +static inline void compute_tcp_checksum(struct iphdr *ip, struct tcphdr *tcp, + __u16 len, void *data_end) +{ + struct tcp_psedu_head { + __be32 saddr; + __be32 daddr; + __u8 zero; + __u8 proto; + __u16 tcplen; + }; + struct tcp_psedu_head psedu; + char *tail = NULL; + char left_over[2] = {0}; + + psedu.saddr = ip->saddr; + psedu.daddr = ip->daddr; + psedu.zero = 0; + psedu.proto = 6; + psedu.tcplen = bpf_htons(len); + + tcp->check = 0; + + u32 csum = 0; + u16 *next_u16 = (u16 *)&psedu; + unsigned int i; + +#pragma clang loop unroll(full) + for (i = 0; i < 
(sizeof(struct tcp_psedu_head) >> 1); i++) + csum += *next_u16++; + + next_u16 = (u16 *)tcp; + for (i = 0; i < 1024 && (i < len / 2); i++) { + if (next_u16 + 1 > data_end) + break; + csum += *next_u16++; + } + + if (len % 2 == 1) { + tail = (char *)next_u16; + if (tail < data_end) + left_over[0] = *tail; + csum += *(unsigned short *)left_over; + } + csum = (csum >> 16) + (csum & 0xffff); /* add in accumulated carries */ + csum += csum >> 16; /* add potential last carry */ + + tcp->check = (0xffff & ~csum); +} + +#define extract_kvdata(field, size, kv_data, kv_len) \ +do { \ + kv_data = payload; \ + kv_len = 0; \ + \ + if (payload + 1 > data_end || payload[0] != '$') \ + return XDP_PASS; \ + \ + payload++; \ + if (payload < data_end && payload[0] >= '0' && payload[0] <= '9') { \ + kv_len = kv_len * 10 + (payload[0] - '0'); \ + payload++; \ + } \ + \ + if (payload < data_end && payload[0] >= '0' && payload[0] <= '9') { \ + kv_len = kv_len * 10 + (payload[0] - '0'); \ + payload++; \ + } \ + \ + if (payload < data_end && payload[0] >= '0' && payload[0] <= '9') { \ + kv_len = kv_len * 10 + (payload[0] - '0'); \ + payload++; \ + } \ + \ + if (payload < data_end && payload[0] >= '0' && payload[0] <= '9') { \ + kv_len = kv_len * 10 + (payload[0] - '0'); \ + payload++; \ + } \ + \ + if (payload + 2 > data_end || payload[0] != '\r' || payload[1] != '\n') \ + return XDP_PASS; \ + \ + payload += 2; \ + \ + if (kv_len == 0 || kv_len > size) \ + return XDP_PASS; \ + \ + payload += kv_len + 2; \ + kv_len = payload - kv_data; \ + \ + if (kv_len > sizeof(ctx->field.data)) \ + return XDP_PASS; \ + \ + bpf_xdp_load_bytes(xdp, kv_data - data, ctx->field.data, kv_len); \ + ctx->field.len = kv_len; \ +} while (0) + +#define adjust_xdp_tail(size, len) \ +do { \ + char *new_end; \ + \ + new_end = payload = (char *)thdr + thdr->doff * 4; \ + for (i = 0; i < size && i < len; i++) \ + new_end++; \ + \ + if (new_end > data_end) \ + err = bpf_xdp_adjust_tail(xdp, new_end - data_end); \ + else if (new_end < data_end) \ + err = bpf_xdp_adjust_tail(xdp, -(data_end - new_end)); \ + \ + if (err) \ + return XDP_PASS; \ +} while (0) + +#define sync_tcp_seq(len, ndrop) \ +do { \ + struct bpf_sock_tuple tuple; \ + \ + tuple.ipv4.saddr = ihdr->saddr; \ + tuple.ipv4.daddr = ihdr->daddr; \ + tuple.ipv4.sport = thdr->source; \ + tuple.ipv4.dport = thdr->dest; \ + \ + tuple.seq = __bpf_ntohl(thdr->seq); \ + tuple.delta = __bpf_ntohs(ihdr->tot_len) - ihlen - thlen; \ + tuple.ack_seq = __bpf_ntohs(thdr->ack_seq) + len; \ + \ + if (bpf_update_tcp_seq(xdp, &tuple, sizeof(tuple.ipv4), -1, 0)) { \ + ndrop++; \ + return XDP_DROP; \ + } \ +} while (0) + +#define build_reply_head(len) \ +do { \ + thdr->doff = 5; /* discard tcp options */ \ + port = thdr->source; \ + thdr->source = thdr->dest; \ + thdr->dest = port; \ + \ + seq = __bpf_ntohl(thdr->seq); \ + seq += __bpf_ntohs(ihdr->tot_len) - ihlen - thlen; \ + thdr->seq = thdr->ack_seq; \ + thdr->ack_seq = __bpf_ntohl(seq); \ + \ + ipaddr = ihdr->saddr; \ + ihdr->saddr = ihdr->daddr; \ + ihdr->daddr = ipaddr; \ + ihdr->tot_len = __bpf_htons(ihlen + thdr->doff * 4 + len); \ + \ + memcpy(macaddr, ehdr->h_source, ETH_ALEN); \ + memcpy(ehdr->h_source, ehdr->h_dest, ETH_ALEN); \ + memcpy(ehdr->h_dest, macaddr, ETH_ALEN); \ +} while (0) + +SEC("bmc/main") +int bmc_main(struct xdp_md *xdp) +{ + int err; + u32 klen; + u32 vlen; + unsigned int i; + unsigned int seq; + u8 macaddr[ETH_ALEN]; + __be32 ipaddr; + __le16 port; + char *data = (char *)(long)xdp->data; + char *data_end = (char 
*)(long)xdp->data_end; + struct ethhdr *ehdr = NULL; + struct iphdr *ihdr = NULL; + struct tcphdr *thdr = NULL; + unsigned int ihlen; + unsigned int thlen; + char *payload; + u32 offset; + int is_get = 0; + int expect_get = 0; + struct redis_ctx *ctx; + struct redis_bmc_stat *stat; + char *key_data; + char *value_data; + u32 key_len; + u32 value_len; + + ehdr = (struct ethhdr *)data; + if (ehdr + 1 > data_end) + return XDP_PASS; + + if (ehdr->h_proto != __bpf_constant_htons(ETH_P_IP)) + return XDP_PASS; + + ihdr = (struct iphdr *)(ehdr + 1); + if (ihdr + 1 > data_end) + return XDP_PASS; + + if (ihdr->ihl != 5 || ihdr->protocol != IPPROTO_TCP) + return XDP_PASS; + + ihlen = ihdr->ihl * 4; + + if (ihdr->frag_off & __bpf_htons(IP_MF | IP_OFFSET)) + return XDP_PASS; + + if (__bpf_htons(ihdr->tot_len) > ETH_DATA_LEN) + return XDP_PASS; + + thdr = (struct tcphdr *)(ihdr + 1); + if (thdr + 1 > data_end) + return XDP_PASS; + + if (thdr->syn || thdr->fin || thdr->rst) + return XDP_PASS; + + if (!is_bmc_port(thdr->dest)) + return XDP_PASS; + + thlen = thdr->doff * 4; + payload = (void *)thdr + thlen; + + /* + * SET message format: + * "*3\r\n" // this is an array with 3 elements + * "$3\r\n" // the first element is a string with 3 characters + * "set\r\n" // the string is "set" + * "$5\r\n" // the second element is a string with 5 characters + * "key01\r\n" // the string is "key01" + * "$5\r\n" // the third element is a string with 5 characters + * "val01\r\n" // the string is "valu01" + * + * GET message format: + * "*2\r\n" // this is an array with 3 elements + * "$3\r\n" // the first element is a string with 3 characters + * "get\r\n" // the string is "get" + * "$5\r\n" // the second element is a string with 5 characters + * "key01\r\n" // the string is "key01" + */ + if (payload + 8 > data_end) + return XDP_PASS; + + if (payload[0] != '*' || (payload[1] != '2' && payload[1] != '3') || + payload[2] != '\r' || payload[3] != '\n' || payload[4] != '$' || + payload[5] != '3' || payload[6] != '\r' || payload[7] != '\n') + return XDP_PASS; + + expect_get = (payload[1] == '2'); + payload += 8; + + if (payload + 5 > data_end) + return XDP_PASS; + + switch (payload[0]) { + case 'g': + is_get = 1; + case 's': + if (payload[1] != 'e' || payload[2] != 't' || + payload[3] != '\r' || payload[4] != '\n') + return XDP_PASS; + break; + case 'G': + is_get = 1; + case 'S': + if (payload[1] != 'E' || payload[2] != 'T' || + payload[3] != '\r' || payload[4] != '\n') + return XDP_PASS; + break; + default: + return XDP_PASS; + } + + payload += 5; + + if (expect_get != is_get) + return XDP_PASS; + + ctx = get_ctx(); + if (!ctx) + return XDP_PASS; + + memset(ctx, 0, sizeof(*ctx)); + + stat = get_stat(); + if (!stat) + return XDP_PASS; + + extract_kvdata(key, BMC_MAX_REDIS_KEY_LEN, key_data, key_len); + + if (is_get) { + struct redis_value *val; + + stat->total_get_requests++; + + val = bpf_map_lookup_elem(&bmc_storage, &ctx->key); + if (!val || !val->len || val->len > sizeof(val->data)) + return XDP_PASS; + vlen = val->len; + + sync_tcp_seq(vlen, stat->drop_get_requests); + + build_reply_head(vlen); + + adjust_xdp_tail(BMC_MAX_REDIS_VALUE_LEN, vlen); + + data = (char *)(long)xdp->data; + data_end = (char *)(long)xdp->data_end; + + ihdr = (struct iphdr *)(data + sizeof(struct ethhdr)); + thdr = (struct tcphdr *)(ihdr + 1); + if (ihdr + 1 > data_end || thdr + 1 > data_end) + return XDP_PASS; + + offset = sizeof(*ehdr) + ihdr->ihl * 4 + thdr->doff * 4; + bpf_xdp_store_bytes(xdp, offset, val->data, vlen); + + 
compute_ip_checksum(ihdr); + + compute_tcp_checksum(ihdr, thdr, vlen + thdr->doff * 4, + data_end); + + stat->hit_get_requests++; + + return XDP_TX; + } else { + char reply[] = { '+', 'O', 'K', '\r', '\n'}; + + stat->total_set_requests++; + + /* make sure the stupid verifier will not reject the prog */ + payload = key_data; + for (i = 0; i < sizeof(ctx->key.data) && i < key_len; i++) + payload++; + + extract_kvdata(value, BMC_MAX_REDIS_VALUE_LEN, value_data, + value_len); + + err = bpf_map_update_elem(&bmc_storage, &ctx->key, + &ctx->value, BPF_ANY); + if (err) + return XDP_PASS; + + sync_tcp_seq(sizeof(reply), stat->drop_set_requests); + + build_reply_head(sizeof(reply)); + + adjust_xdp_tail(sizeof(reply), sizeof(reply)); + + data = (char *)(long)xdp->data; + data_end = (char *)(long)xdp->data_end; + + ihdr = (struct iphdr *)(data + sizeof(struct ethhdr)); + thdr = (struct tcphdr *)(ihdr + 1); + if (ihdr + 1 > data_end || thdr + 1 > data_end) + return XDP_PASS; + + offset = sizeof(*ehdr) + ihdr->ihl * 4 + thdr->doff * 4; + bpf_xdp_store_bytes(xdp, offset, reply, sizeof(reply)); + + compute_ip_checksum(ihdr); + + compute_tcp_checksum(ihdr, thdr, thdr->doff * 4 + sizeof(reply), + data_end); + + stat->hit_set_requests++; + + return XDP_TX; + } + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/bmc/common.h b/samples/bpf/bmc/common.h new file mode 100644 index 000000000000..51c8623ab4f8 --- /dev/null +++ b/samples/bpf/bmc/common.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. + * Description: common header for both user prog and bpf kernel prog + */ +#ifndef __REDIS_BMC_COMMON_H__ +#define __REDIS_BMC_COMMON_H__ + +#define REDIS_GET_PROG_INDEX 0 +#define REDIS_SET_PROG_INDEX 1 + +struct redis_bmc_stat { + __u64 total_get_requests; + __u64 hit_get_requests; + __u64 drop_get_requests; + __u64 total_set_requests; + __u64 hit_set_requests; + __u64 drop_set_requests; +}; + +#endif diff --git a/samples/bpf/bmc/tool.c b/samples/bpf/bmc/tool.c new file mode 100644 index 000000000000..f7889434bcd4 --- /dev/null +++ b/samples/bpf/bmc/tool.c @@ -0,0 +1,763 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. 
+ */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include <errno.h> + +#include <linux/if_link.h> + +#include <unistd.h> +#include <net/if.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/select.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <fcntl.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "common.h" + +#define DEFAULT_CGROUP_PATH "/sys/fs/cgroup" +#define DEFAULT_REDIS_PORT 6379 + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) +#endif + +#define IFINDEX_NUM 8 + +struct { + char *cgroup_path; + char *bpf_path; + int cgroup_fd; + int map_ports_fd; + int map_storage_fd; + int map_interface_fd; + int map_stats_fd; + int redis_xdp_main_prog_fd; + uint16_t listen_port; + unsigned int ifindex; +} bmc; + +struct bmc_prog_info { + const char *sec_name; + enum bpf_prog_type prog_type; + enum bpf_attach_type attach_type; + int *p_prog_fd; + int *p_attach_fd; + unsigned int attach_flags; + unsigned int is_xdp_main; + const char *pin_path; + struct bpf_program *prog; +}; + +struct bmc_map_info { + const char *map_name; + int *p_map_fd; + char *pin_path; + struct bpf_map *map; + bool is_stat_map; + bool is_interface_map; +}; + +static struct bmc_prog_info prog_infos[] = { + { + .sec_name = "bmc/main", + .prog_type = BPF_PROG_TYPE_XDP, + .p_prog_fd = &bmc.redis_xdp_main_prog_fd, + .attach_flags = XDP_FLAGS_DRV_MODE, // XDP_FLAGS_SKB_MODE + .is_xdp_main = 1, + .pin_path = "/sys/fs/bpf/bmc/prog_xdp_main" + } +}; + +static struct bmc_map_info map_infos[] = { + { + .map_name = "bmc_ports", + .p_map_fd = &bmc.map_ports_fd, + .pin_path = "/sys/fs/bpf/bmc/map_ports" + }, + { + .map_name = "bmc_storage", + .p_map_fd = &bmc.map_storage_fd, + .pin_path = "/sys/fs/bpf/bmc/map_storage" + }, + { + .map_name = "bmc_interface", + .p_map_fd = &bmc.map_interface_fd, + .pin_path = "/sys/fs/bpf/bmc/interface", + .is_interface_map = true, + }, + { + .map_name = "bmc_stats", + .p_map_fd = &bmc.map_stats_fd, + .pin_path = "/sys/fs/bpf/bmc/stats", + .is_stat_map = true, + }, +}; + +static int find_type_by_sec_name(const char *sec_name, + enum bpf_prog_type *p_prog_type, + enum bpf_attach_type *p_attach_type) +{ + int i; + + if (sec_name == NULL) { + fprintf(stderr, "sec_name is NULL\n"); + return -1; + } + + for (i = 0; i < ARRAY_SIZE(prog_infos); i++) { + if (!strcmp(prog_infos[i].sec_name, sec_name)) { + *p_prog_type = prog_infos[i].prog_type; + *p_attach_type = prog_infos[i].attach_type; + return 0; + } + } + + fprintf(stderr, "unknown prog %s\n", sec_name); + + return -1; +} + +static int set_prog_type(struct bpf_object *obj) +{ + const char *sec_name; + struct bpf_program *prog; + enum bpf_prog_type prog_type; + enum bpf_attach_type attach_type; + + bpf_object__for_each_program(prog, obj) { + sec_name = bpf_program__section_name(prog); + if (find_type_by_sec_name(sec_name, &prog_type, &attach_type)) + return -1; + bpf_program__set_type(prog, prog_type); + if (prog_type != BPF_PROG_TYPE_XDP) + bpf_program__set_expected_attach_type(prog, attach_type); + } + + return 0; +} + +static struct bpf_object *load_bpf_file(const char *bpf_file) +{ + int err; + char err_buf[256]; + struct bpf_object *obj; + + obj = bpf_object__open(bpf_file); + err = libbpf_get_error(obj); + if (err) { + libbpf_strerror(err, err_buf, sizeof(err_buf)); + fprintf(stderr, "unable to open bpf file %s : %s\n", bpf_file, + err_buf); + return NULL; + } + + if (set_prog_type(obj)) { + 
bpf_object__close(obj); + return NULL; + } + + err = bpf_object__load(obj); + if (err) { + fprintf(stderr, "load bpf object failed\n"); + bpf_object__close(obj); + return NULL; + } + + return obj; +} + +static int find_prog(struct bpf_object *obj, const char *sec_name, + struct bpf_program **p_prog, int *p_prog_fd) +{ + int fd; + struct bpf_program *prog; + + prog = bpf_object__find_program_by_title(obj, sec_name); + if (!prog) { + fprintf(stderr, "failed to find prog %s\n", sec_name); + return -1; + } + + fd = bpf_program__fd(prog); + if (fd < 0) { + fprintf(stderr, "failed to get fd of prog %s\n", sec_name); + return -1; + } + + *p_prog = prog; + *p_prog_fd = fd; + + return 0; +} + +static void unpin_progs(int n) +{ + int i; + + for (i = 0; i < n; i++) + bpf_program__unpin(prog_infos[i].prog, prog_infos[i].pin_path); +} + +static int find_progs(struct bpf_object *obj) +{ + int i; + struct bmc_prog_info *info; + + for (i = 0; i < ARRAY_SIZE(prog_infos); i++) { + info = &prog_infos[i]; + + if (find_prog(obj, info->sec_name, &info->prog, info->p_prog_fd)) + goto error_find_prog; + + if (bpf_program__pin(info->prog, info->pin_path)) + goto error_find_prog; + } + + return 0; + +error_find_prog: + unpin_progs(i); + return -1; +} + +static int find_map(struct bpf_object *obj, const char *map_name, + struct bpf_map **p_map, int *p_map_fd) +{ + int fd; + struct bpf_map *map; + + map = bpf_object__find_map_by_name(obj, map_name); + if (!map) { + fprintf(stderr, "failed to find map %s\n", map_name); + return -1; + } + + fd = bpf_map__fd(map); + if (fd < 0) { + fprintf(stderr, "failed to get fd of map %s\n", map_name); + return -1; + } + + + *p_map = map; + *p_map_fd = fd; + + return 0; +} + +static void unpin_maps(int n) +{ + int i; + + for (i = 0; i < n; i++) + bpf_map__unpin(map_infos[i].map, map_infos[i].pin_path); +} + +static int find_maps(struct bpf_object *obj) +{ + int i; + __u32 key; + __u32 value; + int fd; + struct bmc_map_info *info; + + for (i = 0; i < ARRAY_SIZE(map_infos); i++) { + info = &map_infos[i]; + + if (find_map(obj, info->map_name, &info->map, info->p_map_fd)) + goto error_find_map; + + if (bpf_map__pin(info->map, info->pin_path)) { + fprintf(stderr, "failed to pin map %s to path %s\n", + info->map_name, info->pin_path); + goto error_find_map; + } + + if (info->is_interface_map) { + key = 0; + value = bmc.ifindex; + fd = bpf_map__fd(info->map); + bpf_map_update_elem(fd, &key, &value, 0); + } + } + + return 0; + +error_find_map: + unpin_maps(i); + return -1; +} + +static void detach_xdp_progs(unsigned int ifindex, __u32 flags) +{ + bpf_set_link_xdp_fd(ifindex, -1, flags); +} + +static void detach_progs(int n) +{ + int i; + struct bmc_prog_info *info; + + for (i = 0; i < n; i++) { + info = &prog_infos[i]; + if (info->is_xdp_main) + detach_xdp_progs(bmc.ifindex, info->attach_flags); + else if (info->prog_type != BPF_PROG_TYPE_XDP) + bpf_prog_detach(*info->p_prog_fd, info->attach_type); + } +} + +static int attach_xdp_prog(int prog_fd, __u32 flags) +{ + if (bmc.ifindex) { + if (bpf_set_link_xdp_fd(bmc.ifindex, prog_fd, flags)) { + fprintf(stderr, "failed to attach xdp prog\n"); + return -1; + } + } + return 0; +} + +static int attach_progs(struct bpf_object *obj) +{ + int i; + int err; + int prog_fd; + int attach_fd; + unsigned int flags; + enum bpf_attach_type type; + struct bmc_prog_info *info; + + for (i = 0; i < ARRAY_SIZE(prog_infos); i++) { + info = &prog_infos[i]; + prog_fd = *info->p_prog_fd; + flags = info->attach_flags; + + if (info->is_xdp_main) + err = 
attach_xdp_prog(prog_fd, flags); + else if (info->prog_type != BPF_PROG_TYPE_XDP && + info->p_attach_fd != NULL) { + attach_fd = *info->p_attach_fd; + type = info->attach_type; + err = bpf_prog_attach(prog_fd, attach_fd, type, flags); + } else + continue; + + if (err) { + fprintf(stderr, "attach prog %s failed!\n", + info->sec_name); + goto error_attach_prog; + } + } + + return 0; + +error_attach_prog: + detach_progs(i); + + return -1; +} + +static int add_bmc_port(void) +{ + int ret; + int map_fd = bmc.map_ports_fd; + uint16_t port = htons(bmc.listen_port); + uint32_t key = (uint32_t)port; + uint32_t value = 1; + + ret = bpf_map_update_elem(map_fd, &key, &value, 0); + if (ret) + fprintf(stderr, "failed to add port %u\n", port); + + return ret; +} + +static int setup_bpf(void) +{ + struct bpf_object *obj; + + bmc.cgroup_fd = open(bmc.cgroup_path, O_DIRECTORY, O_RDONLY); + if (bmc.cgroup_fd < 0) { + fprintf(stderr, "failed to open cgroup %s: %s\n", + bmc.cgroup_path, strerror(errno)); + return -1; + } + + obj = load_bpf_file(bmc.bpf_path); + if (!obj) + goto error_load_object; + + if (find_progs(obj)) + goto error_load_object; + + if (find_maps(obj)) + goto error_find_maps; + + if (attach_progs(obj)) + goto error_attach_progs; + + if (add_bmc_port()) + goto error_add_port; + + return 0; + +error_add_port: + detach_progs(ARRAY_SIZE(prog_infos)); +error_attach_progs: + unpin_maps(ARRAY_SIZE(map_infos)); +error_find_maps: + unpin_progs(ARRAY_SIZE(prog_infos)); +error_load_object: + bpf_object__close(obj); + close(bmc.cgroup_fd); + return -1; +} + +static int parse_load_args(int argc, char *argv[]) +{ + int opt; + int port; + const char *ifname = NULL; + + bmc.cgroup_path = DEFAULT_CGROUP_PATH; + bmc.listen_port = DEFAULT_REDIS_PORT; + bmc.ifindex = 0; + + while ((opt = getopt(argc, argv, "c:p:i:")) != -1) { + switch (opt) { + case 'c': + bmc.cgroup_path = optarg; + break; + case 'p': + port = atoi(optarg); + if (port <= 0 || port >= USHRT_MAX) { + fprintf(stderr, "invalid port: %s\n", optarg); + return -1; + } + bmc.listen_port = port; + break; + case 'i': + printf("interface: %s\n", optarg); + ifname = optarg; + bmc.ifindex = if_nametoindex(ifname); + break; + default: + fprintf(stderr, "unknown option %c\n", opt); + return -1; + } + } + + if (!bmc.ifindex) { + fprintf(stderr, "no netwrok interface found\n"); + return -1; + } + + if (optind >= argc) { + fprintf(stderr, "no bpf prog file found\n"); + return -1; + } + + bmc.bpf_path = argv[optind]; + + printf("bpf file: %s\n", bmc.bpf_path); + printf("cgroup path: %s\n", bmc.cgroup_path); + printf("listen port: %d\n", bmc.listen_port); + printf("interface: %s\n", ifname); + + return 0; +} + +struct cmd { + const char *name; + int (*func)(int argc, char *argv[]); +}; + +static int do_prog(int argc, char *argv[]); +static int do_stat(int argc, char *argv[]); + +static int do_prog_load(int argc, char *argv[]); +static int do_prog_unload(int argc, char *argv[]); + +static struct cmd main_cmds[] = { + { "prog", do_prog }, + { "stat", do_stat }, +}; + +static struct cmd prog_cmds[] = { + { "load", do_prog_load }, + { "unload", do_prog_unload }, +}; + +static char *elf_name; + +static int dispatch_cmd(struct cmd cmds[], int ncmd, int argc, + char *argv[], void (*help)(void)) +{ + int i; + int ret; + + if (argc <= 0) { + help(); + return -1; + } + + for (i = 0; i < ncmd; i++) { + if (!strcmp(argv[0], cmds[i].name)) { + ret = cmds[i].func(argc - 1, argv + 1); + if (ret == -2) { + help(); + ret = -1; + } + return ret; + } + } + + help(); + + return -1; 
+} + +static int do_prog_load(int argc, char *argv[]) +{ + if (parse_load_args(argc + 1, argv - 1) < 0) + return -2; + + if (setup_bpf()) + return -1; + + return 0; +} + +static int do_prog_unload(int argc, char *argv[]) +{ + int i; + int err; + int prog_fd; + int cgroup_fd; + int map_fd; + char *interface_map_path = NULL; + char *cgroup_path = DEFAULT_CGROUP_PATH; + __u32 ifindex; + __u32 key; + + if (argc > 1) + cgroup_path = argv[0]; + + cgroup_fd = open(cgroup_path, O_DIRECTORY, O_RDONLY); + if (cgroup_fd < 0) { + fprintf(stderr, "failed to open cgroup path: %s\n", + cgroup_path); + return -1; + } + + for (i = 0; i < ARRAY_SIZE(map_infos); i++) { + if (map_infos[i].is_interface_map) { + interface_map_path = map_infos[i].pin_path; + break; + } + } + + if (!interface_map_path) { + fprintf(stderr, "no interface map found\n"); + return -1; + } + + map_fd = bpf_obj_get(interface_map_path); + if (map_fd < 0) { + fprintf(stderr, "failed to get map from %s\n", + interface_map_path); + return -1; + } + + key = 0; + err = bpf_map_lookup_elem(map_fd, &key, &ifindex); + close(map_fd); + if (err) { + fprintf(stderr, "lookup interface failed\n"); + return -1; + } + + for (i = 0; i < ARRAY_SIZE(prog_infos); i++) { + if (prog_infos[i].attach_type == BPF_CGROUP_SOCK_OPS) { + prog_fd = bpf_obj_get(prog_infos[i].pin_path); + if (prog_fd >= 0) + bpf_prog_detach2(prog_fd, cgroup_fd, + BPF_CGROUP_SOCK_OPS); + } + + if (prog_infos[i].is_xdp_main) + detach_xdp_progs(ifindex, prog_infos[i].attach_flags); + + unlink(prog_infos[i].pin_path); + } + + for (i = 0; i < ARRAY_SIZE(map_infos); i++) + unlink(map_infos[i].pin_path); + + return 0; +} + +static void do_prog_help(void) +{ + fprintf(stderr, + "Usage: %s prog load [-c CGROUP_PATH] [-p LISTEN_PORT]" + " {-i INTERFACE} {BPF_FILE}\n" + " %s prog unload [CGROUP_PATH]\n", + elf_name, elf_name); +} + +static int do_prog(int argc, char *argv[]) +{ + return dispatch_cmd(prog_cmds, ARRAY_SIZE(prog_cmds), + argc, argv, do_prog_help); +} + +static int do_stat(int argc, char *argv[]) +{ + int i; + int fd; + int err; + int ncpu; + bool found = false; + struct bmc_map_info *info; + struct bpf_map_info map = {}; + struct redis_bmc_stat *percpu_stat; + struct redis_bmc_stat stat = {}; + __u32 len = sizeof(map); + __u32 key; + + ncpu = sysconf(_SC_NPROCESSORS_ONLN); + if (ncpu < 0) { + fprintf(stderr, "sysconf failed: %s\n", strerror(errno)); + return -1; + } + + percpu_stat = malloc(sizeof(struct redis_bmc_stat) * ncpu); + if (!percpu_stat) { + fprintf(stderr, "malloc percpu stat failed\n"); + return -1; + } + + for (i = 0; i < ARRAY_SIZE(map_infos); i++) { + info = &map_infos[i]; + if (info->is_stat_map) { + found = true; + break; + } + } + + if (!found) { + fprintf(stderr, "no stats map found\n"); + free(percpu_stat); + return -1; + } + + fd = bpf_obj_get(info->pin_path); + if (fd < 0) { + fprintf(stderr, "failed to open %s\n", + info->pin_path); + free(percpu_stat); + return -1; + } + + err = bpf_obj_get_info_by_fd(fd, &map, &len); + if (err) { + fprintf(stderr, "failed to get map info\n"); + err = -1; + goto out; + } + + if (map.type != BPF_MAP_TYPE_PERCPU_ARRAY) { + fprintf(stderr, "unexpected map type: %d\n", map.type); + err = -1; + goto out; + } + + if (map.key_size != sizeof(__u32)) { + fprintf(stderr, "unexpected map key_size: %u\n", map.key_size); + err = -1; + goto out; + } + + if (map.value_size != sizeof(struct redis_bmc_stat)) { + fprintf(stderr, "unexpected map key_size: %u\n", map.key_size); + err = -1; + goto out; + } + + key = 0; + err = 
bpf_map_lookup_elem(fd, &key, percpu_stat); + if (err) { + fprintf(stderr, "lookup cpu stat failed, cpu=%u\n", i); + err = -1; + goto out; + } + + for (int i = 0; i < ncpu; i++) { + stat.total_get_requests += percpu_stat[i].total_get_requests; + stat.hit_get_requests += percpu_stat[i].hit_get_requests; + stat.drop_get_requests += percpu_stat[i].drop_get_requests; + stat.total_set_requests += percpu_stat[i].total_set_requests; + stat.hit_set_requests += percpu_stat[i].hit_set_requests; + stat.drop_set_requests += percpu_stat[i].drop_set_requests; + } + + printf("Total GET Requests: %llu\n", stat.total_get_requests); + printf("Hit GET Requests: %llu (%.2f%%)\n", stat.hit_get_requests, + stat.total_get_requests == 0 ? 0 : + (double)stat.hit_get_requests / + (double)stat.total_get_requests * + 100); + printf("Dropped GET Requests: %llu (%.2lf%%)\n", stat.drop_get_requests, + stat.total_get_requests == 0 ? 0 : + (double)stat.drop_get_requests / + (double)stat.total_get_requests * + 100); + + printf("Total SET Requests: %llu\n", stat.total_set_requests); + printf("Hit SET Requests: %llu (%.2f%%)\n", stat.hit_set_requests, + stat.total_set_requests == 0 ? 0 : + (double)stat.hit_set_requests / + (double)stat.total_set_requests * + 100); + printf("Dropped SET Requests: %llu (%.2lf%%)\n", stat.drop_set_requests, + stat.total_set_requests == 0 ? 0 : + (double)stat.drop_set_requests / + (double)stat.total_set_requests * + 100); + +out: + close(fd); + free(percpu_stat); + + return err; +} + +static void do_main_help(void) +{ + fprintf(stderr, + "Usage: %s OBJECT { COMMAND | help }\n" + " OBJECT := { prog | stat }\n", + elf_name); +} + +int main(int argc, char *argv[]) +{ + elf_name = argv[0]; + + return dispatch_cmd(main_cmds, ARRAY_SIZE(main_cmds), + argc - 1, argv + 1, do_main_help); +}