Kernel
Threads by month
- ----- 2025 -----
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- 3 participants
- 17953 discussions

12 Aug '22
From: Xu Kuohai <xukuohai(a)huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5M05G
CVE: NA
-------------------------------------------------------
BMC is an in-kernel key-value cache implemented in BPF and proposed by
paper [1]. The paper discussed BMC for memcached, obtaining at least
6x performance speedup.
This patch implements a sample BMC for Redis. Paper [1] implements BMC
in XDP, bypassing the kernel network stack entirely. Because Redis is
based on the TCP protocol, and it is almost impossible to fully process
TCP traffic in XDP, this patch implements BMC in sockmap, which sits at
the top of the kernel network stack. Since the kernel network stack is
not bypassed, the speedup is not significant. Anyway, this is only a
sample implementation, and its performance can be improved
incrementally.
See [2] for details on how to build samples/bpf.
Output files:
samples/bpf/bmctool
samples/bpf/bmc/bpf.o
Sample usage:
bmctool prog load -p 6379 ./bmc/bpf.o # load bmc bpf prog and attach it
# to sockets with listen port 6379
bmctool stat # dump bmc status
bmctool prog unload # detach and unload bmc prog
[1] https://www.usenix.org/conference/nsdi21/presentation/ghigoff
[2] https://www.kernel.org/doc/readme/samples-bpf-README.rst
Signed-off-by: Xu Kuohai <xukuohai(a)huawei.com>
Reviewed-by: Yang Jihong <yangjihong1(a)huawei.com>
---
samples/bpf/Makefile | 3 +
samples/bpf/bmc/bpf.c | 144 ++++++++
samples/bpf/bmc/common.h | 21 ++
samples/bpf/bmc/redis.h | 648 ++++++++++++++++++++++++++++++++++
samples/bpf/bmc/tool.c | 733 +++++++++++++++++++++++++++++++++++++++
5 files changed, 1549 insertions(+)
create mode 100644 samples/bpf/bmc/bpf.c
create mode 100644 samples/bpf/bmc/common.h
create mode 100644 samples/bpf/bmc/redis.h
create mode 100644 samples/bpf/bmc/tool.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index aeebf5d12f32..f9bb6bdad6ce 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -54,6 +54,7 @@ tprogs-y += task_fd_query
tprogs-y += xdp_sample_pkts
tprogs-y += ibumad
tprogs-y += hbm
+tprogs-y += bmctool
# Libbpf dependencies
LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -111,6 +112,7 @@ task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
+bmctool-objs := bmc/tool.o
# Tell kbuild to always build the programs
always-y := $(tprogs-y)
@@ -172,6 +174,7 @@ always-y += ibumad_kern.o
always-y += hbm_out_kern.o
always-y += hbm_edt_kern.o
always-y += xdpsock_kern.o
+always-y += bmc/bpf.o
ifeq ($(ARCH), arm)
# Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux
diff --git a/samples/bpf/bmc/bpf.c b/samples/bpf/bmc/bpf.c
new file mode 100644
index 000000000000..127260c611f8
--- /dev/null
+++ b/samples/bpf/bmc/bpf.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
+ *
+ * Description: BPF program to accelerate Redis. The idea is to add a kernel
+ * cache for Redis data. When new Redis request is received, the kernel cache
+ * is checked, and if the requested data is found in the cache, a Redis reply
+ * message is constructed and sent back directly.
+ */
+
+#include <uapi/linux/in.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/pkt_cls.h>
+
+#include <bpf/bpf_helpers.h>
+
+/*
+ * debug() - copy the format string into a local char array and hand it,
+ * together with its size and any extra arguments, to bpf_trace_printk().
+ */
+#define debug(fmt, ...) \
+do { \
+ char ___fmt[] = fmt; \
+ bpf_trace_printk(___fmt, sizeof(___fmt), ##__VA_ARGS__); \
+} while (0)
+
+/*
+ * Lookup key for the bmc_socks sockhash. Filled in by init_tcp_key() from a
+ * struct bpf_sock.
+ */
+struct tcp_key {
+ __u32 family; /* sk->family */
+ __be32 local_ip4; /* sk->src_ip4 */
+ __be32 remote_ip4; /* sk->dst_ip4 */
+ __be32 local_port; /* htonl(sk->src_port) */
+ __be32 remote_port; /* htonl(ntohs(sk->dst_port)) */
+};
+
+/*
+ * Sockhash keyed by struct tcp_key. Sockets are added by add_bmc_sock(),
+ * removed by delete_bmc_sock(), and used as the redirect target in
+ * sock_redirect().
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ __uint(key_size, sizeof(struct tcp_key));
+ __uint(value_size, sizeof(u64));
+ __uint(max_entries, 1024);
+} bmc_socks SEC(".maps");
+
+/*
+ * Hash map of enabled ports (u32 key, u32 value). is_bmc_port() treats a
+ * port as enabled when its entry exists and holds a non-zero value.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u32));
+ __uint(max_entries, 16);
+} bmc_ports SEC(".maps");
+
+/* Stream parser program: reports the whole skb (skb->len) as one message. */
+SEC("bmc/sock_parser")
+int sock_parser(struct __sk_buff *skb)
+{
+ return skb->len;
+}
+
+/*
+ * Fill *key from the addresses and ports of sk.
+ *
+ * When sk is NULL nothing is written, so *key keeps whatever it held before
+ * the call.
+ */
+static void init_tcp_key(struct tcp_key *key, struct bpf_sock *sk)
+{
+	if (sk == NULL)
+		return;
+
+	key->family = sk->family;
+	key->local_ip4 = sk->src_ip4;
+	key->remote_ip4 = sk->dst_ip4;
+	key->local_port = htonl(sk->src_port);
+	key->remote_port = htonl((u32)ntohs(sk->dst_port));
+}
+
+/*
+ * Redirect skb to the socket stored in bmc_socks under the key derived from
+ * skb->sk.
+ *
+ * Returns SK_PASS when the skb has no socket, otherwise the result of
+ * bpf_sk_redirect_hash().
+ */
+static int sock_redirect(struct __sk_buff *skb)
+{
+	struct bpf_sock *sock = skb->sk;
+	struct tcp_key hash_key;
+
+	if (!sock)
+		return SK_PASS;
+
+	init_tcp_key(&hash_key, sock);
+	return bpf_sk_redirect_hash(skb, &bmc_socks, &hash_key, 0);
+}
+
+#include "redis.h"
+
+/* Stream verdict program: hands every skb to bmc_process() from redis.h. */
+SEC("bmc/sock_verdict")
+int sock_verdict(struct __sk_buff *skb)
+{
+ return bmc_process(skb);
+}
+
+/*
+ * Return true when port has an entry in bmc_ports whose value is non-zero.
+ */
+static bool is_bmc_port(u32 port)
+{
+	u32 *enabled = bpf_map_lookup_elem(&bmc_ports, &port);
+
+	if (enabled == NULL)
+		return 0;
+
+	return *enabled != 0;
+}
+
+/*
+ * Insert the socket behind skops into bmc_socks under the key derived from
+ * sk. The result of bpf_sock_hash_update() is not checked.
+ */
+static void add_bmc_sock(struct bpf_sock_ops *skops, struct bpf_sock *sk)
+{
+ struct tcp_key key;
+
+ init_tcp_key(&key, sk);
+ bpf_sock_hash_update(skops, &bmc_socks, &key, BPF_ANY);
+}
+
+/*
+ * Remove the entry keyed by sk from bmc_socks. skops is unused here, and the
+ * result of bpf_map_delete_elem() is not checked.
+ */
+static void delete_bmc_sock(struct bpf_sock_ops *skops, struct bpf_sock *sk)
+{
+ struct tcp_key key;
+
+ init_tcp_key(&key, sk);
+ bpf_map_delete_elem(&bmc_socks, &key);
+}
+
+/*
+ * sock_ops program: keep bmc_socks in sync with established TCP connections
+ * whose local port is enabled in bmc_ports.
+ *
+ * - On active/passive establish: if the local port is a BMC port, request
+ *   state-change callbacks and add the socket to bmc_socks.
+ * - On a state change to BPF_TCP_CLOSE: remove the socket from bmc_socks.
+ *
+ * Only AF_INET sockets with a non-NULL skops->sk are handled.
+ * Always returns 0.
+ */
+SEC("bmc/sock_ops")
+int sock_ops(struct bpf_sock_ops *skops)
+{
+	u16 local_port;
+	struct bpf_sock *sk;
+
+	sk = skops->sk;
+	if (skops->family != AF_INET || sk == NULL)
+		return 0;
+
+	/*
+	 * bmc_ports is keyed by htons(listen_port) (see add_bmc_port() in
+	 * tool.c), so byte-swap the socket's source port the same way.
+	 */
+	local_port = ntohs((u16)sk->src_port);
+
+	switch ((int)skops->op) {
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		if (is_bmc_port(local_port)) {
+			/* ask for BPF_SOCK_OPS_STATE_CB so close is seen */
+			bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG);
+			add_bmc_sock(skops, sk);
+		}
+		break;
+
+	case BPF_SOCK_OPS_STATE_CB:
+		/* args[1] is compared against the new TCP state */
+		if ((int)skops->args[1] == BPF_TCP_CLOSE)
+			delete_bmc_sock(skops, sk);
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/bmc/common.h b/samples/bpf/bmc/common.h
new file mode 100644
index 000000000000..51c8623ab4f8
--- /dev/null
+++ b/samples/bpf/bmc/common.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
+ * Description: common header for both user prog and bpf kernel prog
+ */
+#ifndef __REDIS_BMC_COMMON_H__
+#define __REDIS_BMC_COMMON_H__
+
+#define REDIS_GET_PROG_INDEX 0
+#define REDIS_SET_PROG_INDEX 1
+
+/*
+ * Counters kept in the bmc_stats map, one entry per CPU index.
+ * "total" is bumped on entry to the GET/SET handler, "hit" when the handler
+ * answers from the cache and redirects, "drop" when the skb is dropped.
+ */
+struct redis_bmc_stat {
+ __u64 total_get_requests;
+ __u64 hit_get_requests;
+ __u64 drop_get_requests;
+ __u64 total_set_requests;
+ __u64 hit_set_requests;
+ __u64 drop_set_requests;
+};
+
+#endif
diff --git a/samples/bpf/bmc/redis.h b/samples/bpf/bmc/redis.h
new file mode 100644
index 000000000000..6e739ce3d81a
--- /dev/null
+++ b/samples/bpf/bmc/redis.h
@@ -0,0 +1,648 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
+ *
+ * Description: This file parses REDIS commands. When a SET command is
+ * received, the KEY and VALUE fields are extracted from the message and
+ * stored in bmc_storage. When a GET command is received, we look up
+ * bmc_storage with the received KEY, and on success we fill the reply
+ * message with the found VALUE and send it back to the client.
+ *
+ * Here is a sample redis SET and GET session:
+ * (C: is the client, S: the server)
+ *
+ * C: "*3\r\n$3\r\nset\r\n$5\r\nkey01\r\n$5\r\nval01\r\n"
+ * S: "+OK\r\n"
+ * C: "*2\r\n$3\r\nget\r\n$5\r\nkey01\r\n"
+ * S: "$5\r\nval01\r\n"
+ *
+ * See [0] for RESP protocol details.
+ * [0] https://redis.io/docs/reference/protocol-spec/
+ */
+
+#include "common.h"
+
+#define BMC_MAX_REDIS_KEY_LEN 448 /* total key size should be less than 512 */
+#define BMC_MAX_REDIS_VALUE_LEN 2048
+#define BMC_MAX_CPUS 512 // NR_CPUS
+
+/* Key of bmc_storage: a length plus a fixed-size buffer for the key bytes. */
+struct redis_key {
+ u32 len; /* number of bytes used in data */
+ u8 data[BMC_MAX_REDIS_KEY_LEN];
+};
+
+/* Value of bmc_storage: a length plus a fixed-size buffer for the value. */
+struct redis_value {
+ u32 len; /* number of bytes used in data */
+ u8 data[BMC_MAX_REDIS_VALUE_LEN];
+};
+
+/*
+ * The key-value cache itself: an LRU hash from struct redis_key to
+ * struct redis_value. Written by the SET handler, read by the GET handler.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __uint(key_size, sizeof(struct redis_key));
+ __uint(value_size, sizeof(struct redis_value));
+ __uint(max_entries, 10000);
+} bmc_storage SEC(".maps");
+
+/*
+ * Program array used by bmc_process() for tail calls; slots are indexed by
+ * REDIS_GET_PROG_INDEX and REDIS_SET_PROG_INDEX.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u32));
+ __uint(max_entries, 2);
+} bmc_jump_table SEC(".maps");
+
+/*
+ * Scratch state stored in ctxmap and shared between bmc_process() and the
+ * tail-called GET/SET handlers.
+ */
+struct redis_ctx {
+ struct redis_key key; /* key parsed by bmc_process() */
+ struct redis_value value; /* value parsed by the SET handler */
+ u32 offset; /* set by bmc_process() to off + key_len + 2 */
+};
+
+/*
+ * Array of struct redis_ctx with BMC_MAX_CPUS slots; get_ctx() indexes it
+ * with the current CPU id.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(struct redis_ctx));
+ __uint(max_entries, BMC_MAX_CPUS);
+} ctxmap SEC(".maps");
+
+/*
+ * Array of struct redis_bmc_stat with BMC_MAX_CPUS slots; get_stat() indexes
+ * it with the current CPU id.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(struct redis_bmc_stat));
+ __uint(max_entries, BMC_MAX_CPUS);
+} bmc_stats SEC(".maps");
+
+/*
+ * Copy len bytes from skb, starting at offset skb_off, into [dst, dend).
+ *
+ * The copy is unrolled into fixed-size bpf_skb_load_bytes() calls of 1024,
+ * 512, ..., 2, 1 bytes. Each chunk size is used at most once, and only when
+ * the remaining length is at least the chunk size and both the destination
+ * buffer and the skb data have more than chunk-size bytes left.
+ *
+ * Returns 0 when exactly len bytes were copied. Returns -1 when len > 2047,
+ * when a helper call fails, or when some bytes are left uncopied.
+ *
+ * NOTE(review): the bounds tests use '<' rather than '<=', so a copy that
+ * would end exactly at dend (e.g. a 448-byte key into the 448-byte key
+ * buffer) skips a chunk and ends with -1. Confirm whether that is intended.
+ * NOTE(review): the local 'i' is unused.
+ */
+static int bmc_copy_from_skb(void *dst, void *dend,
+ struct __sk_buff *skb,
+ u32 skb_off, u32 len)
+{
+ u32 i;
+ u32 off = 0;
+ void *data = (void *)(long)skb->data;
+ void *data_end = (void *)(long)skb->data_end;
+
+ if (len > 2047)
+ return -1;
+
+ if (len >= 1024 && dst + off + 1024 < dend &&
+ data + skb_off + off + 1024 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 1024))
+ return -1;
+ off += 1024;
+ len -= 1024;
+ }
+ if (len >= 512 && dst + off + 512 < dend &&
+ data + skb_off + off + 512 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 512))
+ return -1;
+ off += 512;
+ len -= 512;
+ }
+ if (len >= 256 && dst + off + 256 < dend &&
+ data + skb_off + off + 256 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 256))
+ return -1;
+ off += 256;
+ len -= 256;
+ }
+ if (len >= 128 && dst + off + 128 < dend &&
+ data + skb_off + off + 128 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 128))
+ return -1;
+ off += 128;
+ len -= 128;
+ }
+ if (len >= 64 && dst + off + 64 < dend &&
+ data + skb_off + off + 64 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 64))
+ return -1;
+ off += 64;
+ len -= 64;
+ }
+ if (len >= 32 && dst + off + 32 < dend &&
+ data + skb_off + off + 32 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 32))
+ return -1;
+ off += 32;
+ len -= 32;
+ }
+ if (len >= 16 && dst + off + 16 < dend &&
+ data + skb_off + off + 16 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 16))
+ return -1;
+ off += 16;
+ len -= 16;
+ }
+
+ if (len >= 8 && dst + off + 8 < dend &&
+ data + skb_off + off + 8 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 8))
+ return -1;
+ off += 8;
+ len -= 8;
+ }
+
+ if (len >= 4 && dst + off + 4 < dend &&
+ data + skb_off + off + 4 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 4))
+ return -1;
+ off += 4;
+ len -= 4;
+ }
+
+ if (len >= 2 && dst + off + 2 < dend &&
+ data + skb_off + off + 2 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 2))
+ return -1;
+ off += 2;
+ len -= 2;
+ }
+
+ if (len >= 1 && dst + off + 1 < dend &&
+ data + skb_off + off + 1 < data_end) {
+ if (bpf_skb_load_bytes(skb, skb_off + off, dst + off, 1))
+ return -1;
+ off += 1;
+ len -= 1;
+ }
+
+ /* anything left over means a chunk was skipped by a bounds test */
+ return len == 0 ? 0 : -1;
+}
+
+/*
+ * Copy len bytes from the buffer [dst, dend) into skb at offset skb_off.
+ *
+ * Despite its name, dst is the SOURCE buffer here: every chunk is written
+ * with bpf_skb_store_bytes(skb, skb_off + off, dst + off, ...).
+ *
+ * The copy is unrolled into fixed-size chunks of 1024, 512, ..., 2, 1 bytes.
+ * Each chunk size is used at most once, and only when the remaining length
+ * is at least the chunk size and both the buffer and the skb data have more
+ * than chunk-size bytes left.
+ *
+ * Returns 0 when exactly len bytes were copied. Returns -1 when len > 2047,
+ * when a helper call fails, or when some bytes are left uncopied.
+ *
+ * NOTE(review): the bounds tests use '<' rather than '<=', so a copy that
+ * would end exactly at dend or data_end ends with -1. Confirm intent.
+ * NOTE(review): the local 'i' is unused.
+ */
+static int bmc_copy_to_skb(struct __sk_buff *skb, u32 skb_off,
+ void *dst, void *dend, u32 len)
+{
+ u32 i;
+ u32 off = 0;
+ void *data = (void *)(long)skb->data;
+ void *data_end = (void *)(long)skb->data_end;
+
+ if (len > 2047)
+ return -1;
+
+ if (len >= 1024 && dst + off + 1024 < dend &&
+ data + skb_off + off + 1024 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 1024, 0))
+ return -1;
+ off += 1024;
+ len -= 1024;
+ }
+ if (len >= 512 && dst + off + 512 < dend &&
+ data + skb_off + off + 512 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 512, 0))
+ return -1;
+ off += 512;
+ len -= 512;
+ }
+ if (len >= 256 && dst + off + 256 < dend &&
+ data + skb_off + off + 256 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 256, 0))
+ return -1;
+ off += 256;
+ len -= 256;
+ }
+ if (len >= 128 && dst + off + 128 < dend &&
+ data + skb_off + off + 128 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 128, 0))
+ return -1;
+ off += 128;
+ len -= 128;
+ }
+ if (len >= 64 && dst + off + 64 < dend &&
+ data + skb_off + off + 64 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 64, 0))
+ return -1;
+ off += 64;
+ len -= 64;
+ }
+ if (len >= 32 && dst + off + 32 < dend &&
+ data + skb_off + off + 32 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 32, 0))
+ return -1;
+ off += 32;
+ len -= 32;
+ }
+ if (len >= 16 && dst + off + 16 < dend &&
+ data + skb_off + off + 16 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 16, 0))
+ return -1;
+ off += 16;
+ len -= 16;
+ }
+
+ if (len >= 8 && dst + off + 8 < dend &&
+ data + skb_off + off + 8 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 8, 0))
+ return -1;
+ off += 8;
+ len -= 8;
+ }
+
+ if (len >= 4 && dst + off + 4 < dend &&
+ data + skb_off + off + 4 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 4, 0))
+ return -1;
+ off += 4;
+ len -= 4;
+ }
+
+ if (len >= 2 && dst + off + 2 < dend &&
+ data + skb_off + off + 2 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 2, 0))
+ return -1;
+ off += 2;
+ len -= 2;
+ }
+
+ if (len >= 1 && dst + off + 1 < dend &&
+ data + skb_off + off + 1 < data_end) {
+ if (bpf_skb_store_bytes(skb, skb_off + off, dst + off, 1, 0))
+ return -1;
+ off += 1;
+ len -= 1;
+ }
+
+ /* anything left over means a chunk was skipped by a bounds test */
+ return len == 0 ? 0 : -1;
+}
+
+/*
+ * Return the ctxmap slot for the current CPU, or NULL when the CPU id is
+ * not below BMC_MAX_CPUS or the lookup fails.
+ */
+static inline struct redis_ctx *get_ctx(void)
+{
+	u32 idx = bpf_get_smp_processor_id();
+
+	if (idx < BMC_MAX_CPUS)
+		return bpf_map_lookup_elem(&ctxmap, &idx);
+
+	return NULL;
+}
+
+/*
+ * Return the bmc_stats slot for the current CPU, or NULL when the CPU id is
+ * not below BMC_MAX_CPUS or the lookup fails.
+ */
+static inline struct redis_bmc_stat *get_stat(void)
+{
+	u32 idx = bpf_get_smp_processor_id();
+
+	if (idx < BMC_MAX_CPUS)
+		return bpf_map_lookup_elem(&bmc_stats, &idx);
+
+	return NULL;
+}
+
+/*
+ * Build the RESP bulk-string reply for a GET, in place in skb.
+ *
+ * Looks up the key held in the per-CPU context in bmc_storage. On a hit the
+ * skb is resized to exactly "$<len>\r\n<value>\r\n" and the reply is written
+ * into it.
+ *
+ * Returns:
+ *   BPF_OK       - not served here (no context, cache miss, or unusable
+ *                  cached length); skb has not been modified.
+ *   BPF_REDIRECT - skb now holds the reply.
+ *   BPF_DROP     - resizing or writing skb failed after it may already have
+ *                  been modified.
+ */
+static int do_redis_get_handler(struct __sk_buff *skb, struct redis_ctx *ctx)
+{
+	int i;
+	u32 n;
+	/*
+	 * Must start at 0: when the reply is exactly as long as the request
+	 * neither resize branch below runs and err is tested as-is.
+	 */
+	int err = 0;
+	char *p;
+	char *data;
+	char *data_end;
+	char buf[5];
+	struct redis_value *val;
+
+	ctx = get_ctx();
+	if (!ctx)
+		return BPF_OK;
+
+	val = bpf_map_lookup_elem(&bmc_storage, &ctx->key);
+	if (val == NULL || val->len == 0 || val->len > sizeof(val->data))
+		return BPF_OK;
+
+	n = val->len;
+
+	/* render val->len as decimal digits, least significant first */
+	i = 0;
+	while (i < 5) {
+		buf[i] = '0' + n % 10;
+		n = n / 10;
+		i++;
+		if (n == 0)
+			break;
+	}
+
+	/* lengths that need all 5 digits are not served */
+	if (i >= 5)
+		return BPF_OK;
+
+	/* $ LEN \r \n VALUE \r \n */
+	n = 1 + i + 2 + val->len + 2;
+
+	if (n > skb->len)
+		/* extend head space */
+		err = bpf_skb_change_head(skb, n - skb->len, 0);
+	else if (n < skb->len)
+		/* shrink head space */
+		err = bpf_skb_adjust_room(skb, -(skb->len - n), 0, 0);
+
+	if (err)
+		return BPF_DROP;
+
+	/* the helpers above may have moved the data; reload the pointers */
+	data = (char *)(long)skb->data;
+	data_end = (char *)(long)skb->data_end;
+	p = data;
+	/* 3 is '$' and "\r\n"*/
+	if (p + i + 3 > data_end)
+		return BPF_DROP;
+
+	*p++ = '$';
+	/* digits were stored in reverse, so emit them back to front */
+	while (p < data_end && --i >= 0)
+		*p++ = buf[i];
+	*p++ = '\r';
+	*p++ = '\n';
+
+	n = val->len;
+	if (n == 0 || n > sizeof(val->data) || p + n + 2 > data_end)
+		return BPF_DROP;
+
+	if (bmc_copy_to_skb(skb, p - data, val->data,
+			    val->data + sizeof(val->data), n))
+		return BPF_DROP;
+
+	p += n;
+	char end_mark[] = { '\r', '\n'};
+
+	/* a reply without its terminator must not be sent */
+	if (bpf_skb_store_bytes(skb, p - data, end_mark, sizeof(end_mark), 0))
+		return BPF_DROP;
+
+	return BPF_REDIRECT;
+}
+
+/*
+ * Parse the value element of a SET ("$<len>\r\n<value>...") at the start of
+ * skb, store it in bmc_storage under ctx->key, and rewrite skb into the
+ * "+OK\r\n" reply.
+ *
+ * Returns BPF_REDIRECT once the value is stored and skb holds the reply.
+ * Returns BPF_OK in every other case (malformed or oversized value, copy or
+ * map update failure, skb resize failure); this function never returns
+ * BPF_DROP.
+ *
+ * NOTE(review): presumably skb starts at the value element because
+ * bmc_process() trimmed ctx->offset bytes before the tail call - confirm.
+ * NOTE(review): the result of the final bpf_skb_store_bytes() is ignored.
+ */
+static int do_redis_set_handler(struct __sk_buff *skb, struct redis_ctx *ctx)
+{
+ int err;
+ u32 off = 0;
+ u32 value_len;
+ char *data = (char *)(long)skb->data;
+ char *data_end = (char *)(long)skb->data_end;
+
+ if (data + 1 > data_end || data[0] != '$')
+ return BPF_OK;
+ off++;
+ data++;
+
+ /* parse up to four decimal digits of the value length (unrolled) */
+ value_len = 0;
+ if (data < data_end && data[0] >= '0' && data[0] <= '9') {
+ value_len = value_len * 10 + data[0] - '0';
+ off++;
+ data++;
+ }
+ if (data < data_end && data[0] >= '0' && data[0] <= '9') {
+ value_len = value_len * 10 + data[0] - '0';
+ off++;
+ data++;
+ }
+ if (data < data_end && data[0] >= '0' && data[0] <= '9') {
+ value_len = value_len * 10 + data[0] - '0';
+ off++;
+ data++;
+ }
+ if (data < data_end && data[0] >= '0' && data[0] <= '9') {
+ value_len = value_len * 10 + data[0] - '0';
+ off++;
+ data++;
+ }
+
+ if (data + 2 > data_end || data[0] != '\r' || data[1] != '\n')
+ return BPF_OK;
+ off += 2;
+ data += 2;
+
+ if (data > data_end)
+ return BPF_OK;
+
+ /* format error */
+ if (value_len <= 0 || value_len > sizeof(ctx->value.data) ||
+ value_len >= data_end - data) {
+ return BPF_OK;
+ }
+
+ if (bmc_copy_from_skb(ctx->value.data,
+ ctx->value.data + sizeof(ctx->value.data),
+ skb, off, value_len))
+ return BPF_OK;
+
+ ctx->value.len = value_len;
+
+ /* on update failure make sure no stale value stays cached for this key */
+ if (bpf_map_update_elem(&bmc_storage, &ctx->key, &ctx->value, BPF_ANY)) {
+ bpf_map_delete_elem(&bmc_storage, &ctx->key);
+ return BPF_OK;
+ }
+
+ char reply[] = { '+', 'O', 'K', '\r', '\n'};
+
+ /* resize skb to exactly sizeof(reply) bytes */
+ if (skb->len < sizeof(reply))
+ /* extend head space */
+ err = bpf_skb_change_head(skb, sizeof(reply) - skb->len, 0);
+ else
+ /* shrink head space */
+ err = bpf_skb_adjust_room(skb, -(skb->len - sizeof(reply)), 0, 0);
+
+ if (err)
+ return BPF_OK;
+
+ bpf_skb_store_bytes(skb, 0, reply, sizeof(reply), 0);
+
+ return BPF_REDIRECT;
+}
+
+/*
+ * Tail-call target for GET requests.
+ *
+ * Counts the request, runs do_redis_get_handler() and maps its result to a
+ * verdict: BPF_REDIRECT -> count a hit and redirect the skb through
+ * sock_redirect(); BPF_DROP -> count a drop and return SK_DROP; anything
+ * else -> SK_PASS. Also returns SK_PASS when the per-CPU stat or context
+ * slot is unavailable.
+ */
+SEC("bmc/redis_get_handler")
+int redis_get_handler(struct __sk_buff *skb)
+{
+	struct redis_bmc_stat *st = get_stat();
+	struct redis_ctx *rctx;
+
+	if (st == NULL)
+		return SK_PASS;
+
+	st->total_get_requests++;
+
+	rctx = get_ctx();
+	if (rctx == NULL)
+		return SK_PASS;
+
+	switch (do_redis_get_handler(skb, rctx)) {
+	case BPF_REDIRECT:
+		st->hit_get_requests++;
+		return sock_redirect(skb);
+	case BPF_DROP:
+		st->drop_get_requests++;
+		return SK_DROP;
+	default:
+		return SK_PASS;
+	}
+}
+
+/*
+ * Tail-call target for SET requests.
+ *
+ * Counts the request, runs do_redis_set_handler() and maps its result to a
+ * verdict: BPF_REDIRECT -> count a hit and redirect the skb through
+ * sock_redirect(); BPF_DROP -> count a drop and return SK_DROP.
+ *
+ * For any other result the skb is adjusted by +ctx->offset with
+ * bpf_skb_adjust_room() before passing it on; if that fails the request is
+ * counted as dropped and SK_DROP is returned.
+ * NOTE(review): presumably this undoes the -ctx->offset adjustment made in
+ * bmc_process() before the tail call - confirm.
+ */
+SEC("bmc/redis_set_handler")
+int redis_set_handler(struct __sk_buff *skb)
+{
+	struct redis_bmc_stat *st = get_stat();
+	struct redis_ctx *rctx;
+
+	if (st == NULL)
+		return SK_PASS;
+
+	st->total_set_requests++;
+
+	rctx = get_ctx();
+	if (rctx == NULL)
+		return SK_PASS;
+
+	switch (do_redis_set_handler(skb, rctx)) {
+	case BPF_REDIRECT:
+		st->hit_set_requests++;
+		return sock_redirect(skb);
+	case BPF_DROP:
+		st->drop_set_requests++;
+		return SK_DROP;
+	default:
+		break;
+	}
+
+	if (bpf_skb_adjust_room(skb, rctx->offset, 0, 0) == 0)
+		return SK_PASS;
+
+	st->drop_set_requests++;
+	return SK_DROP;
+}
+
+/*
+ * Entry point of the verdict program: recognise a Redis GET or SET request
+ * at the start of skb, copy its key into the per-CPU context and tail-call
+ * the matching handler from bmc_jump_table.
+ *
+ * Returns SK_PASS whenever the message is not a well-formed GET/SET, the key
+ * does not fit, a helper fails, or the tail call does not take place.
+ */
+static inline int bmc_process(struct __sk_buff *skb)
+{
+ u32 off;
+ int err;
+ u32 key_len;
+ char *data;
+ char *data_end;
+ int expect_get = 0;
+ int is_get = 0;
+ struct redis_ctx *ctx;
+
+ ctx = get_ctx();
+ if (ctx == NULL)
+ return SK_PASS;
+
+ err = bpf_skb_pull_data(skb, skb->len);
+ if (err)
+ return SK_PASS;
+
+ off = 0;
+ data = (char *)(long)skb->data;
+ data_end = (char *)(long)skb->data_end;
+
+ /*
+ * SET message format:
+ * "*3\r\n" // this is an array with 3 elements
+ * "$3\r\n" // the first element is a string with 3 characters
+ * "set\r\n" // the string is "set"
+ * "$5\r\n" // the second element is a string with 5 characters
+ * "key01\r\n" // the string is "key01"
+ * "$5\r\n" // the third element is a string with 5 characters
+ * "val01\r\n" // the string is "val01"
+ *
+ * GET message format:
+ * "*2\r\n" // this is an array with 2 elements
+ * "$3\r\n" // the first element is a string with 3 characters
+ * "get\r\n" // the string is "get"
+ * "$5\r\n" // the second element is a string with 5 characters
+ * "key01\r\n" // the string is "key01"
+ */
+ if (data + 4 > data_end)
+ return SK_PASS;
+
+ /* Not GET, Not SET */
+ if (data[0] != '*' || (data[1] != '2' && data[1] != '3') ||
+ data[2] != '\r' || data[3] != '\n')
+ return SK_PASS;
+
+ /* a 2-element array can only be a GET, a 3-element one only a SET */
+ expect_get = (data[1] == '2');
+ off += 4;
+ data += 4;
+
+ if (data + 4 > data_end)
+ return SK_PASS;
+
+ if (data[0] != '$' || data[1] != '3' || data[2] != '\r' ||
+ data[3] != '\n')
+ return SK_PASS;
+
+ off += 4;
+ data += 4;
+
+ if (data + 5 > data_end)
+ return SK_PASS;
+
+ /* accept "get"/"set" in all-lower or all-upper case only */
+ switch (data[0]) {
+ case 'g':
+ is_get = 1;
+ /* fallthrough */
+ case 's':
+ if (data[1] != 'e' || data[2] != 't' ||
+ data[3] != '\r' || data[4] != '\n')
+ return SK_PASS;
+ break;
+ case 'G':
+ is_get = 1;
+ /* fallthrough */
+ case 'S':
+ if (data[1] != 'E' || data[2] != 'T' ||
+ data[3] != '\r' || data[4] != '\n')
+ return SK_PASS;
+ break;
+ default:
+ return SK_PASS;
+ }
+ off += 5;
+ data += 5;
+
+ /* command name must agree with the element count seen above */
+ if (expect_get != is_get)
+ return SK_PASS;
+
+ if (data + 1 > data_end || data[0] != '$')
+ return SK_PASS;
+ off++;
+ data++;
+
+ /* parse up to four decimal digits of the key length (unrolled) */
+ key_len = 0;
+ if (data < data_end && data[0] >= '0' && data[0] <= '9') {
+ key_len = key_len * 10 + data[0] - '0';
+ off++;
+ data++;
+ }
+ if (data < data_end && data[0] >= '0' && data[0] <= '9') {
+ key_len = key_len * 10 + data[0] - '0';
+ off++;
+ data++;
+ }
+ if (data < data_end && data[0] >= '0' && data[0] <= '9') {
+ key_len = key_len * 10 + data[0] - '0';
+ off++;
+ data++;
+ }
+ if (data < data_end && data[0] >= '0' && data[0] <= '9') {
+ key_len = key_len * 10 + data[0] - '0';
+ off++;
+ data++;
+ }
+ if (data + 2 > data_end || data[0] != '\r' || data[1] != '\n')
+ return SK_PASS;
+ off += 2;
+ data += 2;
+
+ if (data > data_end)
+ return SK_PASS;
+
+ if (key_len == 0 || key_len > sizeof(ctx->key.data) ||
+ key_len >= data_end - data)
+ return SK_PASS;
+
+ /* offset just past the key and its trailing "\r\n" */
+ ctx->offset = off + key_len + 2;
+ ctx->key.len = key_len;
+
+ if (bmc_copy_from_skb(ctx->key.data,
+ ctx->key.data + sizeof(ctx->key.data),
+ skb, off, key_len))
+ return SK_PASS;
+
+ if (is_get) {
+ bpf_tail_call(skb, &bmc_jump_table, REDIS_GET_PROG_INDEX);
+ } else {
+ /* shrink skb by ctx->offset bytes before handing it to the SET handler */
+ err = bpf_skb_adjust_room(skb, -ctx->offset, 0, 0);
+ if (err)
+ return SK_PASS;
+ bpf_tail_call(skb, &bmc_jump_table, REDIS_SET_PROG_INDEX);
+ }
+ /* reached only when the tail call did not transfer control */
+ return SK_PASS;
+}
diff --git a/samples/bpf/bmc/tool.c b/samples/bpf/bmc/tool.c
new file mode 100644
index 000000000000..e45be64a2819
--- /dev/null
+++ b/samples/bpf/bmc/tool.c
@@ -0,0 +1,733 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/select.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <fcntl.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "common.h"
+
+#define DEFAULT_CGROUP_PATH "/sys/fs/cgroup"
+#define DEFAULT_REDIS_PORT 6379
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
+#endif
+
+/*
+ * Global state of the loader: command-line settings plus the file
+ * descriptors of the cgroup, the maps and the programs found in the BPF
+ * object.
+ */
+struct {
+ char *cgroup_path; /* -c option, defaults to DEFAULT_CGROUP_PATH */
+ char *bpf_path; /* BPF object file given on the command line */
+ int cgroup_fd;
+ int map_socks_fd;
+ int map_ports_fd;
+ int map_storage_fd;
+ int map_jump_table_fd;
+ int map_stats_fd;
+ int sock_parser_prog_fd;
+ int sock_verdict_prog_fd;
+ int sock_ops_prog_fd;
+ int redis_get_prog_fd;
+ int redis_set_prog_fd;
+ uint16_t listen_port; /* -p option, defaults to DEFAULT_REDIS_PORT */
+} bmc;
+
+/* Describes one program of the BPF object: how to type, attach and pin it. */
+struct bmc_prog_info {
+ const char *sec_name; /* ELF section name used to find the program */
+ enum bpf_prog_type prog_type;
+ enum bpf_attach_type attach_type;
+ int *p_prog_fd; /* where find_prog() stores the program fd */
+ int *p_attach_fd; /* attach target fd; NULL means "do not attach" */
+ unsigned int attach_flags;
+ const char *pin_path;
+ struct bpf_program *prog; /* filled in by find_prog() */
+};
+
+/* Describes one map of the BPF object: where its fd goes and where to pin it. */
+struct bmc_map_info {
+ const char *map_name;
+ int *p_map_fd; /* where find_map() stores the map fd */
+ char *pin_path;
+ struct bpf_map *map; /* filled in by find_map() */
+ bool is_stat_map; /* set only for the bmc_stats entry */
+};
+
+/*
+ * Table of all programs expected in the BPF object.
+ *
+ * The two redis handlers have p_attach_fd == NULL: attach_progs() skips
+ * them, and add_tail_call() installs their fds in the jump table instead.
+ */
+static struct bmc_prog_info prog_infos[] = {
+ {
+ .sec_name = "bmc/sock_parser",
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .attach_type = BPF_SK_SKB_STREAM_PARSER,
+ .p_prog_fd = &bmc.sock_parser_prog_fd,
+ .p_attach_fd = &bmc.map_socks_fd,
+ .attach_flags = 0,
+ .pin_path = "/sys/fs/bpf/bmc/prog_sock_parser"
+ },
+ {
+ .sec_name = "bmc/sock_verdict",
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .attach_type = BPF_SK_SKB_STREAM_VERDICT,
+ .p_prog_fd = &bmc.sock_verdict_prog_fd,
+ .p_attach_fd = &bmc.map_socks_fd,
+ .attach_flags = 0,
+ .pin_path = "/sys/fs/bpf/bmc/prog_sock_verdict"
+ },
+ {
+ .sec_name = "bmc/sock_ops",
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+ .attach_type = BPF_CGROUP_SOCK_OPS,
+ .p_prog_fd = &bmc.sock_ops_prog_fd,
+ .p_attach_fd = &bmc.cgroup_fd,
+ .attach_flags = 0,
+ .pin_path = "/sys/fs/bpf/bmc/prog_sock_ops"
+ },
+ {
+ .sec_name = "bmc/redis_get_handler",
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .p_prog_fd = &bmc.redis_get_prog_fd,
+ .p_attach_fd = NULL,
+ .attach_flags = 0,
+ .pin_path = "/sys/fs/bpf/bmc/prog_redis_get_handler"
+
+ },
+ {
+ .sec_name = "bmc/redis_set_handler",
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .p_prog_fd = &bmc.redis_set_prog_fd,
+ .p_attach_fd = NULL,
+ .attach_flags = 0,
+ .pin_path = "/sys/fs/bpf/bmc/prog_redis_set_handler"
+
+ }
+};
+
+/* Table of the maps looked up by name in the BPF object and then pinned. */
+static struct bmc_map_info map_infos[] = {
+ {
+ .map_name = "bmc_socks",
+ .p_map_fd = &bmc.map_socks_fd,
+ .pin_path = "/sys/fs/bpf/bmc/map_socks"
+ },
+ {
+ .map_name = "bmc_ports",
+ .p_map_fd = &bmc.map_ports_fd,
+ .pin_path = "/sys/fs/bpf/bmc/map_ports"
+ },
+ {
+ .map_name = "bmc_storage",
+ .p_map_fd = &bmc.map_storage_fd,
+ .pin_path = "/sys/fs/bpf/bmc/map_storage"
+ },
+ {
+ .map_name = "bmc_jump_table",
+ .p_map_fd = &bmc.map_jump_table_fd,
+ .pin_path = "/sys/fs/bpf/bmc/map_jump_table"
+ },
+ {
+ .map_name = "bmc_stats",
+ .p_map_fd = &bmc.map_stats_fd,
+ .pin_path = "/sys/fs/bpf/bmc/stats",
+ .is_stat_map = true,
+ },
+};
+
+/*
+ * Look up sec_name in prog_infos and report the program and attach type of
+ * the matching entry through p_prog_type / p_attach_type.
+ *
+ * Returns 0 on a match, -1 (with a message on stderr) when sec_name is NULL
+ * or not present in the table.
+ */
+static int find_type_by_sec_name(const char *sec_name,
+				 enum bpf_prog_type *p_prog_type,
+				 enum bpf_attach_type *p_attach_type)
+{
+	const struct bmc_prog_info *entry;
+	const struct bmc_prog_info *end = prog_infos + ARRAY_SIZE(prog_infos);
+
+	if (!sec_name) {
+		fprintf(stderr, "sec_name is NULL\n");
+		return -1;
+	}
+
+	for (entry = prog_infos; entry != end; entry++) {
+		if (strcmp(entry->sec_name, sec_name) != 0)
+			continue;
+
+		*p_prog_type = entry->prog_type;
+		*p_attach_type = entry->attach_type;
+		return 0;
+	}
+
+	fprintf(stderr, "unknown prog %s\n", sec_name);
+	return -1;
+}
+
+/*
+ * Set the program type and expected attach type of every program in obj,
+ * using the prog_infos entry that matches the program's section name.
+ *
+ * Returns 0 on success, -1 as soon as a program's section is not listed in
+ * prog_infos.
+ */
+static int set_prog_type(struct bpf_object *obj)
+{
+ const char *sec_name;
+ struct bpf_program *prog;
+ enum bpf_prog_type prog_type;
+ enum bpf_attach_type attach_type;
+
+ bpf_object__for_each_program(prog, obj) {
+ sec_name = bpf_program__section_name(prog);
+ if (find_type_by_sec_name(sec_name, &prog_type, &attach_type))
+ return -1;
+ bpf_program__set_type(prog, prog_type);
+ bpf_program__set_expected_attach_type(prog, attach_type);
+ }
+
+ return 0;
+}
+
+/*
+ * Open bpf_file, assign program types from prog_infos and load the object
+ * into the kernel.
+ *
+ * Returns the loaded object, or NULL on any failure; an object that was
+ * opened but could not be typed or loaded is closed before returning.
+ */
+static struct bpf_object *load_bpf_file(const char *bpf_file)
+{
+	struct bpf_object *obj = bpf_object__open(bpf_file);
+	int err = libbpf_get_error(obj);
+
+	if (err) {
+		char err_buf[256];
+
+		libbpf_strerror(err, err_buf, sizeof(err_buf));
+		fprintf(stderr, "unable to open bpf file %s : %s\n", bpf_file,
+			err_buf);
+		return NULL;
+	}
+
+	if (set_prog_type(obj))
+		goto fail_close;
+
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "load bpf object failed\n");
+		goto fail_close;
+	}
+
+	return obj;
+
+fail_close:
+	bpf_object__close(obj);
+	return NULL;
+}
+
+/*
+ * Find the program in section sec_name of obj and report it and its fd
+ * through p_prog / p_prog_fd.
+ *
+ * Returns 0 on success, -1 (with a message on stderr) when the program is
+ * missing or has no valid fd; the outputs are left untouched on failure.
+ */
+static int find_prog(struct bpf_object *obj, const char *sec_name,
+		     struct bpf_program **p_prog, int *p_prog_fd)
+{
+	struct bpf_program *found;
+	int found_fd;
+
+	found = bpf_object__find_program_by_title(obj, sec_name);
+	if (found == NULL) {
+		fprintf(stderr, "failed to find prog %s\n", sec_name);
+		return -1;
+	}
+
+	found_fd = bpf_program__fd(found);
+	if (found_fd < 0) {
+		fprintf(stderr, "failed to get fd of prog %s\n", sec_name);
+		return -1;
+	}
+
+	*p_prog = found;
+	*p_prog_fd = found_fd;
+	return 0;
+}
+
+/* Unpin the first n programs of prog_infos, in table order. */
+static void unpin_progs(int n)
+{
+	struct bmc_prog_info *entry = prog_infos;
+
+	for (; n > 0; n--, entry++)
+		bpf_program__unpin(entry->prog, entry->pin_path);
+}
+
+/*
+ * Resolve every prog_infos entry in obj (program handle and fd) and pin it
+ * at its pin_path.
+ *
+ * Returns 0 on success. On the first failure, the programs pinned so far
+ * are unpinned again and -1 is returned.
+ */
+static int find_progs(struct bpf_object *obj)
+{
+	int i;
+	struct bmc_prog_info *info;
+
+	for (i = 0; i < ARRAY_SIZE(prog_infos); i++) {
+		info = &prog_infos[i];
+
+		if (find_prog(obj, info->sec_name, &info->prog, info->p_prog_fd))
+			goto error_find_prog;
+
+		if (bpf_program__pin(info->prog, info->pin_path)) {
+			/* report the failure the same way find_maps() does */
+			fprintf(stderr, "failed to pin prog %s to path %s\n",
+				info->sec_name, info->pin_path);
+			goto error_find_prog;
+		}
+	}
+
+	return 0;
+
+error_find_prog:
+	/* entry i itself was not pinned, so only undo entries [0, i) */
+	unpin_progs(i);
+	return -1;
+}
+
+/*
+ * Find the map called map_name in obj and report it and its fd through
+ * p_map / p_map_fd.
+ *
+ * Returns 0 on success, -1 (with a message on stderr) when the map is
+ * missing or has no valid fd; the outputs are left untouched on failure.
+ */
+static int find_map(struct bpf_object *obj, const char *map_name,
+		    struct bpf_map **p_map, int *p_map_fd)
+{
+	struct bpf_map *found;
+	int found_fd;
+
+	found = bpf_object__find_map_by_name(obj, map_name);
+	if (found == NULL) {
+		fprintf(stderr, "failed to find map %s\n", map_name);
+		return -1;
+	}
+
+	found_fd = bpf_map__fd(found);
+	if (found_fd < 0) {
+		fprintf(stderr, "failed to get fd of map %s\n", map_name);
+		return -1;
+	}
+
+	*p_map = found;
+	*p_map_fd = found_fd;
+	return 0;
+}
+
+/* Unpin the first n maps of map_infos, in table order. */
+static void unpin_maps(int n)
+{
+	struct bmc_map_info *entry = map_infos;
+
+	for (; n > 0; n--, entry++)
+		bpf_map__unpin(entry->map, entry->pin_path);
+}
+
+/*
+ * Resolve every map_infos entry in obj (map handle and fd) and pin it at
+ * its pin_path.
+ *
+ * Returns 0 on success. On the first failure, the maps pinned so far are
+ * unpinned again and -1 is returned.
+ */
+static int find_maps(struct bpf_object *obj)
+{
+ int i;
+ struct bmc_map_info *info;
+
+ for (i = 0; i < ARRAY_SIZE(map_infos); i++) {
+ info = &map_infos[i];
+
+ if (find_map(obj, info->map_name, &info->map, info->p_map_fd))
+ goto error_find_map;
+
+ if (bpf_map__pin(info->map, info->pin_path)) {
+ fprintf(stderr, "failed to pin map %s to path %s\n",
+ info->map_name, info->pin_path);
+ goto error_find_map;
+ }
+ }
+
+ return 0;
+
+error_find_map:
+ /* entry i itself was not pinned, so only undo entries [0, i) */
+ unpin_maps(i);
+ return -1;
+}
+
+/*
+ * Detach the first n programs of prog_infos from their attach targets.
+ *
+ * Entries without an attach target (the tail-call handlers) were never
+ * attached by attach_progs() and are skipped. Detach errors are ignored
+ * because this runs on cleanup paths only.
+ */
+static void detach_progs(int n)
+{
+	int i;
+	struct bmc_prog_info *info;
+
+	for (i = 0; i < n; i++) {
+		info = &prog_infos[i];
+		if (!info->p_attach_fd)
+			continue;
+		/*
+		 * Detach from the same target the program was attached to
+		 * (sockmap or cgroup fd), naming the program explicitly.
+		 */
+		bpf_prog_detach2(*info->p_prog_fd, *info->p_attach_fd,
+				 info->attach_type);
+	}
+}
+
+/*
+ * Attach every prog_infos entry that has an attach target to that target.
+ *
+ * Returns 0 on success. On the first failure a message is printed,
+ * detach_progs() is called for the entries before the failing one, and -1
+ * is returned. obj is not used.
+ */
+static int attach_progs(struct bpf_object *obj)
+{
+	int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(prog_infos); idx++) {
+		struct bmc_prog_info *entry = &prog_infos[idx];
+
+		/* tail-call handlers have no attach target */
+		if (entry->p_attach_fd == NULL)
+			continue;
+
+		if (bpf_prog_attach(*entry->p_prog_fd, *entry->p_attach_fd,
+				    entry->attach_type,
+				    entry->attach_flags) == 0)
+			continue;
+
+		fprintf(stderr, "attach prog %s failed!\n", entry->sec_name);
+		detach_progs(idx);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Enable bmc.listen_port in the bmc_ports map.
+ *
+ * The key is htons(listen_port) widened to 32 bits, which is the form the
+ * sock_ops program looks up; the value is 1.
+ *
+ * Returns the result of bpf_map_update_elem() (0 on success).
+ */
+static int add_bmc_port(void)
+{
+	int ret;
+	int map_fd = bmc.map_ports_fd;
+	uint16_t port = htons(bmc.listen_port);
+	uint32_t key = (uint32_t)port;
+	uint32_t value = 1;
+
+	ret = bpf_map_update_elem(map_fd, &key, &value, 0);
+	if (ret)
+		/* print the port as the user gave it, not the swapped key */
+		fprintf(stderr, "failed to add port %u\n",
+			(unsigned int)bmc.listen_port);
+
+	return ret;
+}
+
+/*
+ * Install the GET and SET handler programs in the jump table at
+ * REDIS_GET_PROG_INDEX and REDIS_SET_PROG_INDEX.
+ *
+ * Returns 0 on success. Returns -1 when the GET slot cannot be written; when
+ * the SET slot cannot be written the GET slot is removed again and the
+ * result of that failed update is returned.
+ */
+static int add_tail_call(void)
+{
+ int ret;
+ int map_fd = bmc.map_jump_table_fd;
+ __u32 key;
+ __u32 value;
+
+ key = REDIS_GET_PROG_INDEX;
+ value = bmc.redis_get_prog_fd;
+ ret = bpf_map_update_elem(map_fd, &key, &value, 0);
+ if (ret) {
+ fprintf(stderr, "failed to add redis get tail call prog\n");
+ return -1;
+ }
+
+ key = REDIS_SET_PROG_INDEX;
+ value = bmc.redis_set_prog_fd;
+ ret = bpf_map_update_elem(map_fd, &key, &value, 0);
+ if (ret) {
+ fprintf(stderr, "failed to add redis set tail call prog\n");
+ /* roll back the GET slot so the table is not left half filled */
+ key = REDIS_GET_PROG_INDEX;
+ bpf_map_delete_elem(map_fd, &key);
+ }
+
+ return ret;
+}
+
+/*
+ * Load and wire up the whole BMC BPF object:
+ * open the cgroup, load the object, pin programs and maps, attach the
+ * programs, enable the listen port and fill the tail-call table.
+ *
+ * Returns 0 on success. On failure everything done so far is undone in
+ * reverse order and -1 is returned.
+ */
+static int setup_bpf(void)
+{
+	struct bpf_object *obj;
+
+	/* access mode and O_DIRECTORY both belong in the flags argument */
+	bmc.cgroup_fd = open(bmc.cgroup_path, O_RDONLY | O_DIRECTORY);
+	if (bmc.cgroup_fd < 0) {
+		fprintf(stderr, "failed to open cgroup %s: %s\n",
+			bmc.cgroup_path, strerror(errno));
+		return -1;
+	}
+
+	obj = load_bpf_file(bmc.bpf_path);
+	if (!obj)
+		goto error_close_cgroup;
+
+	/* find_progs() unpins its own partial work on failure */
+	if (find_progs(obj))
+		goto error_close_obj;
+
+	/* find_maps() unpins its own partial work on failure */
+	if (find_maps(obj))
+		goto error_unpin_progs;
+
+	/* attach_progs() detaches its own partial work on failure */
+	if (attach_progs(obj))
+		goto error_unpin_maps;
+
+	if (add_bmc_port())
+		goto error_detach_progs;
+
+	/* programs are attached at this point, so they must be detached too */
+	if (add_tail_call())
+		goto error_detach_progs;
+
+	return 0;
+
+error_detach_progs:
+	detach_progs(ARRAY_SIZE(prog_infos));
+error_unpin_maps:
+	unpin_maps(ARRAY_SIZE(map_infos));
+error_unpin_progs:
+	unpin_progs(ARRAY_SIZE(prog_infos));
+error_close_obj:
+	bpf_object__close(obj);
+error_close_cgroup:
+	close(bmc.cgroup_fd);
+	return -1;
+}
+
+/*
+ * Parse the "prog load" command line into the global bmc configuration.
+ *
+ *   -c CGROUP_PATH   cgroup to use (default DEFAULT_CGROUP_PATH)
+ *   -p LISTEN_PORT   port, 1..65535 (default DEFAULT_REDIS_PORT)
+ *   BPF_FILE         first non-option argument, required
+ *
+ * Returns 0 on success and -1 on any invalid or missing argument.
+ */
+static int parse_load_args(int argc, char *argv[])
+{
+	int opt;
+	long port;
+	char *end;
+
+	bmc.cgroup_path = DEFAULT_CGROUP_PATH;
+	bmc.listen_port = DEFAULT_REDIS_PORT;
+
+	while ((opt = getopt(argc, argv, "c:p:")) != -1) {
+		switch (opt) {
+		case 'c':
+			bmc.cgroup_path = optarg;
+			break;
+		case 'p':
+			/*
+			 * strtol() instead of atoi() so that trailing garbage
+			 * and overflow are rejected; 65535 is a valid port.
+			 */
+			errno = 0;
+			port = strtol(optarg, &end, 10);
+			if (errno || end == optarg || *end != '\0' ||
+			    port <= 0 || port > USHRT_MAX) {
+				fprintf(stderr, "invalid port: %s\n", optarg);
+				return -1;
+			}
+			bmc.listen_port = port;
+			break;
+		default:
+			fprintf(stderr, "unknown option %c\n", opt);
+			return -1;
+		}
+	}
+
+	if (optind >= argc) {
+		fprintf(stderr, "no bpf prog file found\n");
+		return -1;
+	}
+
+	bmc.bpf_path = argv[optind];
+
+	printf("bpf file: %s\n", bmc.bpf_path);
+	printf("cgroup path: %s\n", bmc.cgroup_path);
+	printf("listen port: %d\n", bmc.listen_port);
+
+	return 0;
+}
+
+/* One entry of a command table: the command word and its handler. */
+struct cmd {
+ const char *name;
+ /* Called by dispatch_cmd() with the command word already stripped */
+ int (*func)(int argc, char *argv[]);
+};
+
+static int do_prog(int argc, char *argv[]);
+static int do_stat(int argc, char *argv[]);
+
+static int do_prog_load(int argc, char *argv[]);
+static int do_prog_unload(int argc, char *argv[]);
+
+/* Top-level commands, dispatched from main() */
+static struct cmd main_cmds[] = {
+ { "prog", do_prog },
+ { "stat", do_stat },
+};
+
+/* Sub-commands of "prog", dispatched from do_prog() */
+static struct cmd prog_cmds[] = {
+ { "load", do_prog_load },
+ { "unload", do_prog_unload },
+};
+
+/* argv[0] of the process, used in usage messages */
+static char *elf_name;
+
+/*
+ * Run the handler in @cmds whose name equals argv[0].
+ *
+ * The handler receives the remaining arguments (argv[0] stripped).  Its
+ * return value is passed through, except for -2, which means "bad usage":
+ * @help is called and -1 is returned.  @help is also called, and -1
+ * returned, when there are no arguments or no command matches.
+ */
+static int dispatch_cmd(struct cmd cmds[], int ncmd, int argc,
+			char *argv[], void (*help)(void))
+{
+	int n;
+	int rc;
+
+	if (argc <= 0)
+		goto usage;
+
+	for (n = 0; n < ncmd; n++) {
+		if (strcmp(argv[0], cmds[n].name) != 0)
+			continue;
+
+		rc = cmds[n].func(argc - 1, argv + 1);
+		if (rc == -2)
+			goto usage;
+
+		return rc;
+	}
+
+usage:
+	help();
+
+	return -1;
+}
+
+/*
+ * Handler for "prog load".
+ *
+ * dispatch_cmd() passes the arguments with the command word stripped, so
+ * the argument vector is stepped back by one entry before it is handed to
+ * parse_load_args().  Returns -2 when the arguments are invalid, -1 when
+ * setup_bpf() fails and 0 on success.
+ */
+static int do_prog_load(int argc, char *argv[])
+{
+	int nargs = argc + 1;
+	char **args = argv - 1;
+
+	if (parse_load_args(nargs, args) < 0)
+		return -2;
+
+	return setup_bpf() ? -1 : 0;
+}
+
+/*
+ * Handler for "prog unload [CGROUP_PATH]".
+ *
+ * Detaches the pinned sock_ops programs from the cgroup and removes the
+ * pin files of all programs and maps.  Detach and unlink failures are
+ * ignored so that a partially loaded state can still be cleaned up.
+ * Returns 0 on success, -1 when the cgroup cannot be opened.
+ */
+static int do_prog_unload(int argc, char *argv[])
+{
+	int i;
+	int prog_fd;
+	int cgroup_fd;
+	char *cgroup_path = DEFAULT_CGROUP_PATH;
+
+	/*
+	 * dispatch_cmd() already stripped "unload", so an optional
+	 * CGROUP_PATH is argv[0] and is present whenever argc >= 1.
+	 */
+	if (argc > 0)
+		cgroup_path = argv[0];
+
+	cgroup_fd = open(cgroup_path, O_DIRECTORY | O_RDONLY);
+	if (cgroup_fd < 0) {
+		fprintf(stderr, "failed to open cgroup path: %s\n",
+			cgroup_path);
+		return -1;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(prog_infos); i++) {
+		if (prog_infos[i].attach_type == BPF_CGROUP_SOCK_OPS) {
+			prog_fd = bpf_obj_get(prog_infos[i].pin_path);
+			if (prog_fd >= 0) {
+				bpf_prog_detach2(prog_fd, cgroup_fd,
+						 BPF_CGROUP_SOCK_OPS);
+				/* The fd was only needed for the detach. */
+				close(prog_fd);
+			}
+		}
+		unlink(prog_infos[i].pin_path);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(map_infos); i++)
+		unlink(map_infos[i].pin_path);
+
+	close(cgroup_fd);
+
+	return 0;
+}
+
+/* Print the usage of the "prog" sub-commands to stderr. */
+static void do_prog_help(void)
+{
+ fprintf(stderr,
+ "Usage: %s prog load [-c CGROUP_PATH] [-p LISTEN_PORT] {BPF_FILE}\n"
+ " %s prog unload [CGROUP_PATH]\n",
+ elf_name, elf_name);
+}
+
+/* Handler for "prog": dispatch argv[0] through the prog_cmds table. */
+static int do_prog(int argc, char *argv[])
+{
+ return dispatch_cmd(prog_cmds, ARRAY_SIZE(prog_cmds),
+ argc, argv, do_prog_help);
+}
+
+/* Return @part as a percentage of @total, or 0 when @total is 0. */
+static double bmc_stat_percent(unsigned long long part,
+			       unsigned long long total)
+{
+	if (total == 0)
+		return 0;
+
+	return (double)part / (double)total * 100;
+}
+
+/*
+ * Handler for "stat": sum the counters of the pinned stats map over the
+ * online CPUs and print them.
+ *
+ * The map must be an array map keyed by __u32 whose values are
+ * struct redis_bmc_stat; keys 0 .. ncpu-1 are read.  Returns 0 on success
+ * and -1 on any error.  @argc and @argv are not used.
+ */
+static int do_stat(int argc, char *argv[])
+{
+	int i;
+	int fd;
+	int err;
+	int ncpu;
+	bool found = false;
+	struct bmc_map_info *info;
+	struct bpf_map_info map = {};
+	struct redis_bmc_stat stat = {};
+	__u32 len = sizeof(map);
+
+	ncpu = sysconf(_SC_NPROCESSORS_ONLN);
+	if (ncpu < 0) {
+		fprintf(stderr, "sysconf failed: %s\n", strerror(errno));
+		return -1;
+	}
+
+	/* Locate the first map flagged as the stats map. */
+	for (i = 0; i < ARRAY_SIZE(map_infos); i++) {
+		info = &map_infos[i];
+		if (info->is_stat_map) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		fprintf(stderr, "no stats map found\n");
+		return -1;
+	}
+
+	fd = bpf_obj_get(info->pin_path);
+	if (fd < 0) {
+		fprintf(stderr, "failed to open %s\n",
+			info->pin_path);
+		return -1;
+	}
+
+	err = bpf_obj_get_info_by_fd(fd, &map, &len);
+	if (err) {
+		fprintf(stderr, "failed to get map info\n");
+		goto error;
+	}
+
+	if (map.type != BPF_MAP_TYPE_ARRAY) {
+		fprintf(stderr, "unexpected map type: %d\n", map.type);
+		goto error;
+	}
+
+	if (map.key_size != sizeof(__u32)) {
+		fprintf(stderr, "unexpected map key_size: %u\n", map.key_size);
+		goto error;
+	}
+
+	/* Report the field that was actually checked. */
+	if (map.value_size != sizeof(struct redis_bmc_stat)) {
+		fprintf(stderr, "unexpected map value_size: %u\n",
+			map.value_size);
+		goto error;
+	}
+
+	for (i = 0; i < ncpu; i++) {
+		__u32 key = i;
+		struct redis_bmc_stat value;
+
+		err = bpf_map_lookup_elem(fd, &key, &value);
+		if (err) {
+			fprintf(stderr, "lookup cpu stat failed, cpu=%u\n", key);
+			goto error;
+		}
+		stat.total_get_requests += value.total_get_requests;
+		stat.hit_get_requests += value.hit_get_requests;
+		stat.drop_get_requests += value.drop_get_requests;
+		stat.total_set_requests += value.total_set_requests;
+		stat.hit_set_requests += value.hit_set_requests;
+		stat.drop_set_requests += value.drop_set_requests;
+	}
+
+	printf("Total GET Requests: %llu\n", stat.total_get_requests);
+	printf("Hit GET Requests: %llu (%.2f%%)\n", stat.hit_get_requests,
+	       bmc_stat_percent(stat.hit_get_requests,
+				stat.total_get_requests));
+	printf("Dropped GET Requests: %llu (%.2lf%%)\n", stat.drop_get_requests,
+	       bmc_stat_percent(stat.drop_get_requests,
+				stat.total_get_requests));
+
+	printf("Total SET Requests: %llu\n", stat.total_set_requests);
+	printf("Hit SET Requests: %llu (%.2f%%)\n", stat.hit_set_requests,
+	       bmc_stat_percent(stat.hit_set_requests,
+				stat.total_set_requests));
+	printf("Dropped SET Requests: %llu (%.2lf%%)\n", stat.drop_set_requests,
+	       bmc_stat_percent(stat.drop_set_requests,
+				stat.total_set_requests));
+
+	close(fd);
+
+	return 0;
+
+error:
+	close(fd);
+	return -1;
+}
+
+/* Print the top-level usage to stderr. */
+static void do_main_help(void)
+{
+ fprintf(stderr,
+ "Usage: %s OBJECT { COMMAND | help }\n"
+ " OBJECT := { prog | stat }\n",
+ elf_name);
+}
+
+/*
+ * Entry point: remember the program name for usage messages, then dispatch
+ * the first argument through main_cmds.  The process exit status is the
+ * value returned by dispatch_cmd().
+ */
+int main(int argc, char *argv[])
+{
+ elf_name = argv[0];
+
+ /* Skip argv[0] so the command word becomes the first argument */
+ return dispatch_cmd(main_cmds, ARRAY_SIZE(main_cmds),
+ argc - 1, argv + 1, do_main_help);
+}
--
2.20.1
1
0

[PATCH openEuler-1.0-LTS 1/2] mm/slub: add missing TID updates on slab deactivation
by Yongqiang Liu 12 Aug '22
by Yongqiang Liu 12 Aug '22
12 Aug '22
From: Jann Horn <jannh(a)google.com>
stable inclusion
from stable-4.19.252
commit e2b2f0e2e34d71ae6c2a1114fd3c525930e84bc7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5LJH1
CVE: NA
--------------------------------
commit eeaa345e128515135ccb864c04482180c08e3259 upstream.
The fastpath in slab_alloc_node() assumes that c->slab is stable as long as
the TID stays the same. However, two places in __slab_alloc() currently
don't update the TID when deactivating the CPU slab.
If multiple operations race the right way, this could lead to an object
getting lost; or, in an even more unlikely situation, it could even lead to
an object being freed onto the wrong slab's freelist, messing up the
`inuse` counter and eventually causing a page to be freed to the page
allocator while it still contains slab objects.
(I haven't actually tested these cases though, this is just based on
looking at the code. Writing testcases for this stuff seems like it'd be
a pain...)
The race leading to state inconsistency is (all operations on the same CPU
and kmem_cache):
- task A: begin do_slab_free():
- read TID
- read pcpu freelist (==NULL)
- check `slab == c->slab` (true)
- [PREEMPT A->B]
- task B: begin slab_alloc_node():
- fastpath fails (`c->freelist` is NULL)
- enter __slab_alloc()
- slub_get_cpu_ptr() (disables preemption)
- enter ___slab_alloc()
- take local_lock_irqsave()
- read c->freelist as NULL
- get_freelist() returns NULL
- write `c->slab = NULL`
- drop local_unlock_irqrestore()
- goto new_slab
- slub_percpu_partial() is NULL
- get_partial() returns NULL
- slub_put_cpu_ptr() (enables preemption)
- [PREEMPT B->A]
- task A: finish do_slab_free():
- this_cpu_cmpxchg_double() succeeds()
- [CORRUPT STATE: c->slab==NULL, c->freelist!=NULL]
From there, the object on c->freelist will get lost if task B is allowed to
continue from here: It will proceed to the retry_load_slab label,
set c->slab, then jump to load_freelist, which clobbers c->freelist.
But if we instead continue as follows, we get worse corruption:
- task A: run __slab_free() on object from other struct slab:
- CPU_PARTIAL_FREE case (slab was on no list, is now on pcpu partial)
- task A: run slab_alloc_node() with NUMA node constraint:
- fastpath fails (c->slab is NULL)
- call __slab_alloc()
- slub_get_cpu_ptr() (disables preemption)
- enter ___slab_alloc()
- c->slab is NULL: goto new_slab
- slub_percpu_partial() is non-NULL
- set c->slab to slub_percpu_partial(c)
- [CORRUPT STATE: c->slab points to slab-1, c->freelist has objects
from slab-2]
- goto redo
- node_match() fails
- goto deactivate_slab
- existing c->freelist is passed into deactivate_slab()
- inuse count of slab-1 is decremented to account for object from
slab-2
At this point, the inuse count of slab-1 is 1 lower than it should be.
This means that if we free all allocated objects in slab-1 except for one,
SLUB will think that slab-1 is completely unused, and may free its page,
leading to use-after-free.
Fixes: c17dda40a6a4e ("slub: Separate out kmem_cache_cpu processing from deactivate_slab")
Fixes: 03e404af26dc2 ("slub: fast release on full slab")
Cc: stable(a)vger.kernel.org
Signed-off-by: Jann Horn <jannh(a)google.com>
Acked-by: Christoph Lameter <cl(a)linux.com>
Acked-by: David Rientjes <rientjes(a)google.com>
Reviewed-by: Muchun Song <songmuchun(a)bytedance.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo(a)gmail.com>
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Link: https://lore.kernel.org/r/20220608182205.2945720-1-jannh@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
mm/slub.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 7b5630ca9274..4bc29bcd0d5d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2168,6 +2168,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
c->page = NULL;
c->freelist = NULL;
+ c->tid = next_tid(c->tid);
}
/*
@@ -2301,8 +2302,6 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
stat(s, CPUSLAB_FLUSH);
deactivate_slab(s, c->page, c->freelist, c);
-
- c->tid = next_tid(c->tid);
}
/*
@@ -2589,6 +2588,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
if (!freelist) {
c->page = NULL;
+ c->tid = next_tid(c->tid);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
--
2.25.1
1
1
您好!
Kernel SIG 邀请您参加 2022-08-12 14:00 召开的Zoom会议(自动录制)
会议主题:openEuler Kernel SIG例会
会议内容:
1.英特尔Sapphire Rapids平台PMU新特性介绍
2.基于BPF内核缓存的Redis加速特性评审
3.BPF CO-RE(Compile Once-Run Everywhere)特性评审
会议链接:https://us06web.zoom.us/j/89836175849?pwd=ODlUNVhldkdndnN0b21VRUIxNkg0dz09
会议纪要:https://etherpad.openeuler.org/p/Kernel-meetings
温馨提醒:建议接入会议后修改参会人的姓名,也可以使用您在gitee.com的ID
更多资讯尽在:https://openeuler.org/zh/
Hello!
openEuler Kernel SIG invites you to attend the Zoom conference (auto recording), which will be held at 2022-08-12 14:00.
The subject of the conference is openEuler Kernel SIG例会,
Summary:
1.英特尔Sapphire Rapids平台PMU新特性介绍
2.基于BPF内核缓存的Redis加速特性评审
3.BPF CO-RE(Compile Once-Run Everywhere)特性评审
You can join the meeting at https://us06web.zoom.us/j/89836175849?pwd=ODlUNVhldkdndnN0b21VRUIxNkg0dz09.
Add topics at https://etherpad.openeuler.org/p/Kernel-meetings.
Note: You are advised to change the participant name after joining the conference or use your ID at gitee.com.
More information: https://openeuler.org/en/
1
0

10 Aug '22
From: Juergen Gross <jgross(a)suse.com>
stable inclusion
from stable-v5.10.132
commit 136d7987fcfdeca73ee3c6a29e48f99fdd0f4d87
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5JTYM
CVE: CVE-2022-36123
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
[ Upstream commit 38fa5479b41376dc9d7f57e71c83514285a25ca0 ]
The .brk section has the same properties as .bss: it is an alloc-only
section and should be cleared before being used.
Not doing so is especially a problem for Xen PV guests, as the
hypervisor will validate page tables (check for writable page tables
and hypervisor private bits) before accepting them to be used.
Make sure .brk is initially zero by letting clear_bss() clear the brk
area, too.
Signed-off-by: Juergen Gross <jgross(a)suse.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Link: https://lore.kernel.org/r/20220630071441.28576-3-jgross@suse.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
Signed-off-by: GONG, Ruiqi <gongruiqi1(a)huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng(a)huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2(a)huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai(a)huawei.com>
---
arch/x86/kernel/head64.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 05e117137b45..efe13ab366f4 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -419,6 +419,8 @@ static void __init clear_bss(void)
{
memset(__bss_start, 0,
(unsigned long) __bss_stop - (unsigned long) __bss_start);
+ memset(__brk_base, 0,
+ (unsigned long) __brk_limit - (unsigned long) __brk_base);
}
static unsigned long get_cmd_line_ptr(void)
--
2.20.1
1
0

[PATCH openEuler-5.10 01/15] drivers/perf: hisi: Add driver for HiSilicon PCIe PMU
by Zheng Zengkai 09 Aug '22
by Zheng Zengkai 09 Aug '22
09 Aug '22
From: Qi Liu <liuqi115(a)huawei.com>
mainline inclusion
from mainline-v5.17-rc1
commit 8404b0fbc7fb
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5AZ87
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------------------------------------------------
PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported
to sample bandwidth, latency, buffer occupation etc.
Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is
registered as a PMU in /sys/bus/event_source/devices, so users can
select target PMU, and use filter to do further sets.
Filtering options contains:
event - select the event.
port - select target Root Ports. Information of Root Ports are
shown under sysfs.
bdf - select requester_id of target EP device.
trig_len - set trigger condition for starting event statistics.
trig_mode - set trigger mode. 0 means statistics start when the value is
bigger than the trigger condition, and 1 means when it is smaller.
thr_len - set threshold for statistics.
thr_mode - set threshold mode. 0 means count when bigger than threshold,
and 1 means smaller.
Acked-by: Krzysztof Wilczyński <kw(a)linux.com>
Reviewed-by: John Garry <john.garry(a)huawei.com>
Signed-off-by: Qi Liu <liuqi115(a)huawei.com>
Reviewed-by: Shaokun Zhang <zhangshaokun(a)hisilicon.com>
Link: https://lore.kernel.org/r/20211202080633.2919-3-liuqi115@huawei.com
Signed-off-by: Will Deacon <will(a)kernel.org>
Signed-off-by: Wangming Shao <shaowangming(a)h-partners.com>
Reviewed-by: Junhao He <hejunhao3(a)huawei.com>
Reviewed-by: Yang Jihong <yangjihong1(a)huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai(a)huawei.com>
---
MAINTAINERS | 2 +
drivers/perf/hisilicon/Kconfig | 9 +
drivers/perf/hisilicon/Makefile | 2 +
drivers/perf/hisilicon/hisi_pcie_pmu.c | 951 +++++++++++++++++++++++++
include/linux/cpuhotplug.h | 3 +
5 files changed, 967 insertions(+)
create mode 100644 drivers/perf/hisilicon/hisi_pcie_pmu.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 466b1c599848..9908e5442110 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7976,8 +7976,10 @@ F: Documentation/devicetree/bindings/misc/hisilicon-hikey-usb.yaml
HISILICON PMU DRIVER
M: Shaokun Zhang <zhangshaokun(a)hisilicon.com>
+M: Qi Liu <liuqi115(a)huawei.com>
S: Supported
W: http://www.hisilicon.com
+F: Documentation/admin-guide/perf/hisi-pcie-pmu.rst
F: Documentation/admin-guide/perf/hisi-pmu.rst
F: drivers/perf/hisilicon
diff --git a/drivers/perf/hisilicon/Kconfig b/drivers/perf/hisilicon/Kconfig
index c5d1b7019fff..5546218b5598 100644
--- a/drivers/perf/hisilicon/Kconfig
+++ b/drivers/perf/hisilicon/Kconfig
@@ -5,3 +5,12 @@ config HISI_PMU
help
Support for HiSilicon SoC L3 Cache performance monitor, Hydra Home
Agent performance monitor and DDR Controller performance monitor.
+
+config HISI_PCIE_PMU
+ tristate "HiSilicon PCIE PERF PMU"
+ depends on PCI && ARM64
+ help
+ Provide support for HiSilicon PCIe performance monitoring unit (PMU)
+ RCiEP devices.
+ Adds the PCIe PMU into perf events system for monitoring latency,
+ bandwidth etc.
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index 22e384cdfd53..ad0e8110f373 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -4,3 +4,5 @@ obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \
hisi_uncore_pa_pmu.o \
hisi_uncore_l3t_pmu.o \
hisi_uncore_lpddrc_pmu.o
+
+obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c
new file mode 100644
index 000000000000..2f18838754ec
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c
@@ -0,0 +1,951 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This driver adds support for PCIe PMU RCiEP device. Related
+ * perf events are bandwidth, latency etc.
+ *
+ * Copyright (C) 2021 HiSilicon Limited
+ * Author: Qi Liu <liuqi115(a)huawei.com>
+ */
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/perf_event.h>
+
+#define DRV_NAME "hisi_pcie_pmu"
+/* Define registers */
+#define HISI_PCIE_GLOBAL_CTRL 0x00
+#define HISI_PCIE_EVENT_CTRL 0x010
+#define HISI_PCIE_CNT 0x090
+#define HISI_PCIE_EXT_CNT 0x110
+#define HISI_PCIE_INT_STAT 0x150
+#define HISI_PCIE_INT_MASK 0x154
+#define HISI_PCIE_REG_BDF 0xfe0
+#define HISI_PCIE_REG_VERSION 0xfe4
+#define HISI_PCIE_REG_INFO 0xfe8
+
+/* Define command in HISI_PCIE_GLOBAL_CTRL */
+#define HISI_PCIE_GLOBAL_EN 0x01
+#define HISI_PCIE_GLOBAL_NONE 0
+
+/* Define command in HISI_PCIE_EVENT_CTRL */
+#define HISI_PCIE_EVENT_EN BIT_ULL(20)
+#define HISI_PCIE_RESET_CNT BIT_ULL(22)
+#define HISI_PCIE_INIT_SET BIT_ULL(34)
+#define HISI_PCIE_THR_EN BIT_ULL(26)
+#define HISI_PCIE_TARGET_EN BIT_ULL(32)
+#define HISI_PCIE_TRIG_EN BIT_ULL(52)
+
+/* Define offsets in HISI_PCIE_EVENT_CTRL */
+#define HISI_PCIE_EVENT_M GENMASK_ULL(15, 0)
+#define HISI_PCIE_THR_MODE_M GENMASK_ULL(27, 27)
+#define HISI_PCIE_THR_M GENMASK_ULL(31, 28)
+#define HISI_PCIE_TARGET_M GENMASK_ULL(52, 36)
+#define HISI_PCIE_TRIG_MODE_M GENMASK_ULL(53, 53)
+#define HISI_PCIE_TRIG_M GENMASK_ULL(59, 56)
+
+#define HISI_PCIE_MAX_COUNTERS 8
+#define HISI_PCIE_REG_STEP 8
+#define HISI_PCIE_THR_MAX_VAL 10
+#define HISI_PCIE_TRIG_MAX_VAL 10
+#define HISI_PCIE_MAX_PERIOD (GENMASK_ULL(63, 0))
+#define HISI_PCIE_INIT_VAL BIT_ULL(63)
+
+/* Driver state for one PCIe PMU RCiEP device. */
+struct hisi_pcie_pmu {
+ /* Event occupying each hardware counter; a NULL slot is free */
+ struct perf_event *hw_events[HISI_PCIE_MAX_COUNTERS];
+ struct hlist_node node;
+ struct pci_dev *pdev;
+ struct pmu pmu;
+ /* Base address of the memory-mapped PMU registers */
+ void __iomem *base;
+ int irq;
+ /* Value reported through the "identifier" sysfs attribute */
+ u32 identifier;
+ /* Minimum and maximum BDF of root ports monitored by PMU */
+ u16 bdf_min;
+ u16 bdf_max;
+ /* CPU that events of this PMU are assigned to */
+ int on_cpu;
+};
+
+/* A 32-bit register value split into its low and high 16-bit halves. */
+struct hisi_pcie_reg_pair {
+ u16 lo;
+ u16 hi;
+};
+
+/* Map a struct pmu pointer back to the hisi_pcie_pmu that embeds it */
+#define to_pcie_pmu(p) (container_of((p), struct hisi_pcie_pmu, pmu))
+/* Low 8 bits of a BDF: the device/function number */
+#define GET_PCI_DEVFN(bdf) ((bdf) & 0xff)
+
+/*
+ * Generate hisi_pcie_get_<name>(), which extracts bits [_hi:_lo] of the
+ * given perf_event_attr config field of an event.
+ */
+#define HISI_PCIE_PMU_FILTER_ATTR(_name, _config, _hi, _lo) \
+ static u64 hisi_pcie_get_##_name(struct perf_event *event) \
+ { \
+ return FIELD_GET(GENMASK(_hi, _lo), event->attr._config); \
+ } \
+
+HISI_PCIE_PMU_FILTER_ATTR(event, config, 16, 0);
+HISI_PCIE_PMU_FILTER_ATTR(thr_len, config1, 3, 0);
+HISI_PCIE_PMU_FILTER_ATTR(thr_mode, config1, 4, 4);
+HISI_PCIE_PMU_FILTER_ATTR(trig_len, config1, 8, 5);
+HISI_PCIE_PMU_FILTER_ATTR(trig_mode, config1, 9, 9);
+HISI_PCIE_PMU_FILTER_ATTR(port, config2, 15, 0);
+HISI_PCIE_PMU_FILTER_ATTR(bdf, config2, 31, 16);
+
+/* sysfs show: emit the format string stored in the extended attribute. */
+static ssize_t hisi_pcie_format_sysfs_show(struct device *dev, struct device_attribute *attr,
+					   char *buf)
+{
+	struct dev_ext_attribute *ext = container_of(attr, struct dev_ext_attribute, attr);
+	char *format = ext->var;
+
+	return sysfs_emit(buf, "%s\n", format);
+}
+
+/* sysfs show: emit the event id of the attribute as "config=0x<id>". */
+static ssize_t hisi_pcie_event_sysfs_show(struct device *dev, struct device_attribute *attr,
+					  char *buf)
+{
+	struct perf_pmu_events_attr *ev_attr;
+
+	ev_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+
+	return sysfs_emit(buf, "config=0x%llx\n", ev_attr->id);
+}
+
+/*
+ * Build an anonymous read-only (0444) dev_ext_attribute whose show
+ * callback prints @_format, and evaluate to its struct attribute pointer.
+ */
+#define HISI_PCIE_PMU_FORMAT_ATTR(_name, _format) \
+ (&((struct dev_ext_attribute[]){ \
+ { .attr = __ATTR(_name, 0444, hisi_pcie_format_sysfs_show, \
+ NULL), \
+ .var = (void *)_format } \
+ })[0].attr.attr)
+
+/*
+ * Build an anonymous read-only (0444) perf_pmu_events_attr carrying event
+ * id @_id, and evaluate to its struct attribute pointer.
+ */
+#define HISI_PCIE_PMU_EVENT_ATTR(_name, _id) \
+ (&((struct perf_pmu_events_attr[]) { \
+ { .attr = __ATTR(_name, 0444, hisi_pcie_event_sysfs_show, NULL), \
+ .id = _id, } \
+ })[0].attr.attr)
+
+/* sysfs show: the CPU this PMU's events are assigned to, as a cpu list. */
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pcie_pmu;
+
+	pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
+}
+static DEVICE_ATTR_RO(cpumask);
+
+/* sysfs show: the PMU identifier in "%#x" form. */
+static ssize_t identifier_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pcie_pmu;
+
+	pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "%#x\n", pcie_pmu->identifier);
+}
+static DEVICE_ATTR_RO(identifier);
+
+/* sysfs show: the bus number taken from the lowest monitored root port BDF. */
+static ssize_t bus_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pcie_pmu;
+
+	pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "%#04x\n", PCI_BUS_NUM(pcie_pmu->bdf_min));
+}
+static DEVICE_ATTR_RO(bus);
+
+/* Read the 32-bit register at @reg_off and split it into 16-bit halves. */
+static struct hisi_pcie_reg_pair
+hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu, u32 reg_off)
+{
+	struct hisi_pcie_reg_pair pair;
+	u32 raw;
+
+	raw = readl_relaxed(pcie_pmu->base + reg_off);
+	pair.lo = raw;
+	pair.hi = raw >> 16;
+
+	return pair;
+}
+
+/*
+ * Hardware counter and ext_counter work together for bandwidth, latency, bus
+ * utilization and buffer occupancy events. For example, for the RX memory
+ * write latency event (index = 0x0010), counter counts the total delay cycles
+ * and ext_counter counts the number of RX memory write PCIe packets.
+ *
+ * The PMU driver does not combine these two values itself. Instead, "delay
+ * cycles" is exposed as one event (index = 0x0010) and "RX memory write
+ * packets number" as another (index = 0x10010). Bit 16 distinguishes the two;
+ * bits 0-15 are the "real" event index, which is what is written to
+ * HISI_PCIE_EVENT_CTRL.
+ */
+#define EXT_COUNTER_IS_USED(idx) ((idx) & BIT(16))
+
+/* Return bits 0-15 of the event code, dropping the ext_counter selector bit. */
+static u32 hisi_pcie_get_real_event(struct perf_event *event)
+{
+	u64 code = hisi_pcie_get_event(event);
+
+	return code & GENMASK(15, 0);
+}
+
+/* Offset of the @idx-th register in a bank starting at @offset. */
+static u32 hisi_pcie_pmu_get_offset(u32 offset, u32 idx)
+{
+	u32 stride = HISI_PCIE_REG_STEP * idx;
+
+	return offset + stride;
+}
+
+/* Relaxed 32-bit read of the @idx-th register of the bank at @reg_offset. */
+static u32 hisi_pcie_pmu_readl(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
+			       u32 idx)
+{
+	void __iomem *reg;
+
+	reg = pcie_pmu->base + hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	return readl_relaxed(reg);
+}
+
+/* Relaxed 32-bit write of @val to the @idx-th register of the bank at @reg_offset. */
+static void hisi_pcie_pmu_writel(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx, u32 val)
+{
+	void __iomem *reg;
+
+	reg = pcie_pmu->base + hisi_pcie_pmu_get_offset(reg_offset, idx);
+	writel_relaxed(val, reg);
+}
+
+/* Relaxed 64-bit read of the @idx-th register of the bank at @reg_offset. */
+static u64 hisi_pcie_pmu_readq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx)
+{
+	void __iomem *reg;
+
+	reg = pcie_pmu->base + hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	return readq_relaxed(reg);
+}
+
+/* Relaxed 64-bit write of @val to the @idx-th register of the bank at @reg_offset. */
+static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx, u64 val)
+{
+	void __iomem *reg;
+
+	reg = pcie_pmu->base + hisi_pcie_pmu_get_offset(reg_offset, idx);
+	writeq_relaxed(val, reg);
+}
+
+/*
+ * Build the HISI_PCIE_EVENT_CTRL value for @event from its event code and
+ * filter options, and write it to the control register of the event's
+ * counter.
+ */
+static void hisi_pcie_pmu_config_filter(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	u64 rp_mask = hisi_pcie_get_port(event);
+	u64 trig = hisi_pcie_get_trig_len(event);
+	u64 thr = hisi_pcie_get_thr_len(event);
+	u64 ctrl = HISI_PCIE_INIT_SET;
+
+	/* Event index. */
+	ctrl |= FIELD_PREP(HISI_PCIE_EVENT_M, hisi_pcie_get_real_event(event));
+
+	/* Target: root port selection if given, otherwise the requester BDF. */
+	if (!rp_mask) {
+		ctrl |= HISI_PCIE_TARGET_EN;
+		ctrl |= FIELD_PREP(HISI_PCIE_TARGET_M, hisi_pcie_get_bdf(event));
+	} else {
+		ctrl |= FIELD_PREP(HISI_PCIE_TARGET_M, rp_mask);
+	}
+
+	/* Trigger condition, only when a trigger length is set. */
+	if (trig) {
+		ctrl |= HISI_PCIE_TRIG_EN;
+		ctrl |= FIELD_PREP(HISI_PCIE_TRIG_M, trig);
+		ctrl |= FIELD_PREP(HISI_PCIE_TRIG_MODE_M, hisi_pcie_get_trig_mode(event));
+	}
+
+	/* Threshold condition, only when a threshold length is set. */
+	if (thr) {
+		ctrl |= HISI_PCIE_THR_EN;
+		ctrl |= FIELD_PREP(HISI_PCIE_THR_M, thr);
+		ctrl |= FIELD_PREP(HISI_PCIE_THR_MODE_M, hisi_pcie_get_thr_mode(event));
+	}
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, event->hw.idx, ctrl);
+}
+
+/* Set the event's control register back to HISI_PCIE_INIT_SET only. */
+static void hisi_pcie_pmu_clear_filter(struct perf_event *event)
+{
+	hisi_pcie_pmu_writeq(to_pcie_pmu(event->pmu), HISI_PCIE_EVENT_CTRL,
+			     event->hw.idx, HISI_PCIE_INIT_SET);
+}
+
+/*
+ * Check that @bdf names an existing device in the PMU's PCI domain whose
+ * root port lies within the BDF range monitored by this PMU.
+ */
+static bool hisi_pcie_pmu_valid_requester_id(struct hisi_pcie_pmu *pcie_pmu, u32 bdf)
+{
+	int domain = pci_domain_nr(pcie_pmu->pdev->bus);
+	struct pci_dev *ep, *rp;
+	u16 rp_id;
+
+	ep = pci_get_domain_bus_and_slot(domain, PCI_BUS_NUM(bdf), GET_PCI_DEVFN(bdf));
+	if (!ep)
+		return false;
+
+	rp = pcie_find_root_port(ep);
+	/* The lookup reference is dropped whether or not a root port exists. */
+	pci_dev_put(ep);
+	if (!rp)
+		return false;
+
+	rp_id = pci_dev_id(rp);
+
+	return pcie_pmu->bdf_min <= rp_id && rp_id <= pcie_pmu->bdf_max;
+}
+
+/*
+ * Validate the filter options of @event: threshold and trigger lengths must
+ * not exceed their maxima, and a non-zero requester BDF must belong to this
+ * PMU.
+ */
+static bool hisi_pcie_pmu_valid_filter(struct perf_event *event,
+				       struct hisi_pcie_pmu *pcie_pmu)
+{
+	u32 bdf = hisi_pcie_get_bdf(event);
+
+	if (hisi_pcie_get_thr_len(event) > HISI_PCIE_THR_MAX_VAL ||
+	    hisi_pcie_get_trig_len(event) > HISI_PCIE_TRIG_MAX_VAL)
+		return false;
+
+	/* A zero BDF means no requester filter was given. */
+	return !bdf || hisi_pcie_pmu_valid_requester_id(pcie_pmu, bdf);
+}
+
+/* True when both events have the same real (16-bit) event index. */
+static bool hisi_pcie_pmu_cmp_event(struct perf_event *target,
+				    struct perf_event *event)
+{
+	u32 lhs = hisi_pcie_get_real_event(target);
+	u32 rhs = hisi_pcie_get_real_event(event);
+
+	return lhs == rhs;
+}
+
+/*
+ * Check that the group @event belongs to can be scheduled on this PMU.
+ *
+ * Every non-software event of the group must belong to the same PMU as
+ * @event.  Events with the same real event index are counted once, and the
+ * number of distinct events must not exceed HISI_PCIE_MAX_COUNTERS.
+ *
+ * Returns true when the group is valid.
+ */
+static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+	struct perf_event *event_group[HISI_PCIE_MAX_COUNTERS];
+	int counters = 1;
+	int num;
+
+	event_group[0] = leader;
+	if (!is_software_event(leader)) {
+		if (leader->pmu != event->pmu)
+			return false;
+
+		if (leader != event && !hisi_pcie_pmu_cmp_event(leader, event))
+			event_group[counters++] = event;
+	}
+
+	for_each_sibling_event(sibling, event->group_leader) {
+		if (is_software_event(sibling))
+			continue;
+
+		if (sibling->pmu != event->pmu)
+			return false;
+
+		for (num = 0; num < counters; num++) {
+			if (hisi_pcie_pmu_cmp_event(event_group[num], sibling))
+				break;
+		}
+
+		if (num == counters) {
+			/*
+			 * Reject the group before storing the entry: a group
+			 * with more distinct events than counters would
+			 * otherwise write past the end of event_group[].
+			 */
+			if (counters >= HISI_PCIE_MAX_COUNTERS)
+				return false;
+
+			event_group[counters++] = sibling;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * pmu::event_init callback.
+ *
+ * Returns -ENOENT when the event is not of this PMU's type, -EOPNOTSUPP for
+ * sampling or per-task events, -EINVAL when the filter or the event group is
+ * invalid, and 0 on success.  The event is modified only after its type has
+ * been confirmed, and is bound to the PMU's CPU only once it is accepted.
+ */
+static int hisi_pcie_pmu_event_init(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	/* Check the type first; otherwise this is not our event to touch. */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (EXT_COUNTER_IS_USED(hisi_pcie_get_event(event)))
+		hwc->event_base = HISI_PCIE_EXT_CNT;
+	else
+		hwc->event_base = HISI_PCIE_CNT;
+
+	/* Sampling is not supported. */
+	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+		return -EOPNOTSUPP;
+
+	if (!hisi_pcie_pmu_valid_filter(event, pcie_pmu))
+		return -EINVAL;
+
+	if (!hisi_pcie_pmu_validate_event_group(event))
+		return -EINVAL;
+
+	event->cpu = pcie_pmu->on_cpu;
+
+	return 0;
+}
+
+/* Read the 64-bit counter register selected by the event's base and index. */
+static u64 hisi_pcie_pmu_read_counter(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	return hisi_pcie_pmu_readq(to_pcie_pmu(event->pmu), hwc->event_base, hwc->idx);
+}
+
+/*
+ * Look for an active event with the same real event index as @event.
+ *
+ * Returns the counter index of the first such event when it is in the same
+ * group as @event, -EINVAL when it is in a different group, and
+ * HISI_PCIE_MAX_COUNTERS when no related event is active.
+ */
+static int hisi_pcie_pmu_find_related_event(struct hisi_pcie_pmu *pcie_pmu,
+					    struct perf_event *event)
+{
+	struct perf_event *other;
+	int slot;
+
+	for (slot = 0; slot < HISI_PCIE_MAX_COUNTERS; slot++) {
+		other = pcie_pmu->hw_events[slot];
+		if (!other || !hisi_pcie_pmu_cmp_event(other, event))
+			continue;
+
+		/* Related events must be used in group */
+		if (other->group_leader != event->group_leader)
+			return -EINVAL;
+
+		return slot;
+	}
+
+	return HISI_PCIE_MAX_COUNTERS;
+}
+
+/* Return the lowest free counter index, or -EINVAL when all are in use. */
+static int hisi_pcie_pmu_get_event_idx(struct hisi_pcie_pmu *pcie_pmu)
+{
+	int slot = 0;
+
+	while (slot < HISI_PCIE_MAX_COUNTERS && pcie_pmu->hw_events[slot])
+		slot++;
+
+	if (slot == HISI_PCIE_MAX_COUNTERS)
+		return -EINVAL;
+
+	return slot;
+}
+
+/*
+ * Add the counter's progress since the previous reading to event->count and
+ * record the new reading in hwc->prev_count.
+ */
+static void hisi_pcie_pmu_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 new_cnt, prev_cnt, delta;
+
+ /* Retry until prev_count is replaced by the reading taken in this pass */
+ do {
+ prev_cnt = local64_read(&hwc->prev_count);
+ new_cnt = hisi_pcie_pmu_read_counter(event);
+ } while (local64_cmpxchg(&hwc->prev_count, prev_cnt,
+ new_cnt) != prev_cnt);
+
+ /* HISI_PCIE_MAX_PERIOD covers all 64 bits, so the full difference is kept */
+ delta = (new_cnt - prev_cnt) & HISI_PCIE_MAX_PERIOD;
+ local64_add(delta, &event->count);
+}
+
+/* Read handler: bring event->count up to date with the hardware counter. */
+static void hisi_pcie_pmu_read(struct perf_event *event)
+{
+ hisi_pcie_pmu_event_update(event);
+}
+
+/*
+ * Set prev_count and both hardware counters (counter and ext_counter) of the
+ * event's index to HISI_PCIE_INIT_VAL.
+ */
+static void hisi_pcie_pmu_set_period(struct perf_event *event)
+{
+ struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+
+ local64_set(&hwc->prev_count, HISI_PCIE_INIT_VAL);
+ hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_CNT, idx, HISI_PCIE_INIT_VAL);
+ hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EXT_CNT, idx, HISI_PCIE_INIT_VAL);
+}
+
+/* Set HISI_PCIE_EVENT_EN in the counter's control register (read-modify-write). */
+static void hisi_pcie_pmu_enable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	u64 ctrl = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx);
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx,
+			     ctrl | HISI_PCIE_EVENT_EN);
+}
+
+/* Clear HISI_PCIE_EVENT_EN in the counter's control register (read-modify-write). */
+static void hisi_pcie_pmu_disable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	u64 ctrl = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx);
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx,
+			     ctrl & ~HISI_PCIE_EVENT_EN);
+}
+
+/* Write 0 to the counter's HISI_PCIE_INT_MASK register. */
+static void hisi_pcie_pmu_enable_int(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, hwc->idx, 0);
+}
+
+/* Write 1 to the counter's HISI_PCIE_INT_MASK register. */
+static void hisi_pcie_pmu_disable_int(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, hwc->idx, 1);
+}
+
+/*
+ * Reset counter @idx: write HISI_PCIE_RESET_CNT to its control register,
+ * then write HISI_PCIE_INIT_SET.
+ */
+static void hisi_pcie_pmu_reset_counter(struct hisi_pcie_pmu *pcie_pmu, int idx)
+{
+ hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, HISI_PCIE_RESET_CNT);
+ hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, HISI_PCIE_INIT_SET);
+}
+
+/*
+ * Start counting for @event: program its filter, enable the counter and its
+ * interrupt, and initialise the counter values.  Does nothing (after a
+ * one-time warning) if the event is not currently stopped.
+ */
+static void hisi_pcie_pmu_start(struct perf_event *event, int flags)
+{
+ struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+ u64 prev_cnt;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ hisi_pcie_pmu_config_filter(event);
+ hisi_pcie_pmu_enable_counter(pcie_pmu, hwc);
+ hisi_pcie_pmu_enable_int(pcie_pmu, hwc);
+ hisi_pcie_pmu_set_period(event);
+
+ /* On reload, write the saved prev_count back to the event's counter */
+ if (flags & PERF_EF_RELOAD) {
+ prev_cnt = local64_read(&hwc->prev_count);
+ hisi_pcie_pmu_writeq(pcie_pmu, hwc->event_base, idx, prev_cnt);
+ }
+
+ perf_event_update_userpage(event);
+}
+
+/*
+ * perf ->stop() callback.
+ *
+ * Folds the current hardware count into the event, masks the interrupt,
+ * disables the counter and clears the filter, then marks the event
+ * PERF_HES_STOPPED and PERF_HES_UPTODATE. @flags is not consulted.
+ */
+static void hisi_pcie_pmu_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hw = &event->hw;
+	struct hisi_pcie_pmu *pmu_dev = to_pcie_pmu(event->pmu);
+
+	hisi_pcie_pmu_event_update(event);
+	hisi_pcie_pmu_disable_int(pmu_dev, hw);
+	hisi_pcie_pmu_disable_counter(pmu_dev, hw);
+	hisi_pcie_pmu_clear_filter(event);
+
+	/* Stopping an already stopped event is unexpected, but not fatal. */
+	WARN_ON_ONCE(hw->state & PERF_HES_STOPPED);
+	hw->state |= PERF_HES_STOPPED;
+
+	/* The count was just updated above, so record that it is current. */
+	if (!(hw->state & PERF_HES_UPTODATE))
+		hw->state |= PERF_HES_UPTODATE;
+}
+
+/*
+ * perf ->add() callback: bind @event to a hardware counter.
+ *
+ * If hisi_pcie_pmu_find_related_event() returns an index below
+ * HISI_PCIE_MAX_COUNTERS, the event reuses that counter. Otherwise a new
+ * counter is taken from hisi_pcie_pmu_get_event_idx(), recorded in
+ * hw_events[] and reset. The event is started when PERF_EF_START is set.
+ *
+ * Returns 0 on success, or the negative value returned by either lookup.
+ */
+static int hisi_pcie_pmu_add(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hw = &event->hw;
+	struct hisi_pcie_pmu *pmu_dev = to_pcie_pmu(event->pmu);
+	int counter;
+
+	hw->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	/* Check all working events to find a related event. */
+	counter = hisi_pcie_pmu_find_related_event(pmu_dev, event);
+	if (counter < 0)
+		return counter;
+
+	if (counter >= HISI_PCIE_MAX_COUNTERS) {
+		/* No related event is running: claim a counter of our own. */
+		counter = hisi_pcie_pmu_get_event_idx(pmu_dev);
+		if (counter < 0)
+			return counter;
+
+		hw->idx = counter;
+		pmu_dev->hw_events[counter] = event;
+		/* Reset Counter to avoid previous statistic interference. */
+		hisi_pcie_pmu_reset_counter(pmu_dev, counter);
+	} else {
+		/* Current event shares an enabled counter with the related event */
+		hw->idx = counter;
+	}
+
+	if (flags & PERF_EF_START)
+		hisi_pcie_pmu_start(event, PERF_EF_RELOAD);
+
+	return 0;
+}
+
+/*
+ * perf ->del() callback: stop @event and release its hw_events[] slot.
+ *
+ * NOTE(review): hisi_pcie_pmu_add() lets an event share the counter of a
+ * related event without storing itself in hw_events[]; deleting such an event
+ * clears the slot that records the related event - confirm this is intended.
+ */
+static void hisi_pcie_pmu_del(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hw = &event->hw;
+	struct hisi_pcie_pmu *pmu_dev = to_pcie_pmu(event->pmu);
+
+	hisi_pcie_pmu_stop(event, PERF_EF_UPDATE);
+	pmu_dev->hw_events[hw->idx] = NULL;
+	perf_event_update_userpage(event);
+}
+
+/*
+ * perf ->pmu_enable() callback: write HISI_PCIE_GLOBAL_EN to
+ * HISI_PCIE_GLOBAL_CTRL, but only if at least one hw_events[] slot is in use.
+ */
+static void hisi_pcie_pmu_enable(struct pmu *pmu)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
+	int slot;
+
+	for (slot = 0; slot < HISI_PCIE_MAX_COUNTERS; slot++) {
+		if (!pcie_pmu->hw_events[slot])
+			continue;
+
+		/* The first occupied slot is enough to turn the PMU on. */
+		writel(HISI_PCIE_GLOBAL_EN, pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL);
+		return;
+	}
+}
+
+/* perf ->pmu_disable() callback: write HISI_PCIE_GLOBAL_NONE to HISI_PCIE_GLOBAL_CTRL. */
+static void hisi_pcie_pmu_disable(struct pmu *pmu)
+{
+ struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
+
+ writel(HISI_PCIE_GLOBAL_NONE, pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL);
+}
+
+/*
+ * Interrupt handler (@data is the struct hisi_pcie_pmu passed to request_irq).
+ *
+ * Walks every counter; for each one with a non-zero HISI_PCIE_INT_STAT the
+ * status is cleared and, if an event owns the counter, its count is updated
+ * and the counter re-armed.
+ *
+ * Returns IRQ_HANDLED if at least one owned counter was serviced, IRQ_NONE
+ * otherwise.
+ */
+static irqreturn_t hisi_pcie_pmu_irq(int irq, void *data)
+{
+	struct hisi_pcie_pmu *pcie_pmu = data;
+	irqreturn_t handled = IRQ_NONE;
+	int counter;
+
+	for (counter = 0; counter < HISI_PCIE_MAX_COUNTERS; counter++) {
+		struct perf_event *owner;
+		u32 pending;
+
+		pending = hisi_pcie_pmu_readl(pcie_pmu, HISI_PCIE_INT_STAT, counter);
+		if (!pending)
+			continue;
+
+		/* Clear status of interrupt. */
+		hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_STAT, counter, 1);
+
+		owner = pcie_pmu->hw_events[counter];
+		if (owner) {
+			hisi_pcie_pmu_event_update(owner);
+			hisi_pcie_pmu_set_period(owner);
+			handled = IRQ_HANDLED;
+		}
+	}
+
+	return handled;
+}
+
+/*
+ * Allocate a single MSI vector for @pdev and install hisi_pcie_pmu_irq() on
+ * it; the Linux IRQ number is stored in @pcie_pmu->irq.
+ *
+ * Returns 0 on success or the negative error from the PCI/IRQ core. The MSI
+ * vector is released again if request_irq() fails.
+ */
+static int hisi_pcie_pmu_irq_register(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	int vector, err;
+
+	err = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
+	if (err < 0) {
+		pci_err(pdev, "Failed to enable MSI vectors: %d\n", err);
+		return err;
+	}
+
+	vector = pci_irq_vector(pdev, 0);
+	err = request_irq(vector, hisi_pcie_pmu_irq, IRQF_NOBALANCING | IRQF_NO_THREAD,
+			  DRV_NAME, pcie_pmu);
+	if (!err) {
+		pcie_pmu->irq = vector;
+		return 0;
+	}
+
+	pci_err(pdev, "Failed to register IRQ: %d\n", err);
+	pci_free_irq_vectors(pdev);
+
+	return err;
+}
+
+/* Undo hisi_pcie_pmu_irq_register(): free the handler, then the MSI vector. */
+static void hisi_pcie_pmu_irq_unregister(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+ free_irq(pcie_pmu->irq, pcie_pmu);
+ pci_free_irq_vectors(pdev);
+}
+
+/*
+ * CPU hotplug startup callback: if the PMU has no owning CPU yet (on_cpu is
+ * -1), adopt @cpu and steer the PMU interrupt to it. Always returns 0.
+ */
+static int hisi_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node);
+
+	/* An owner is already assigned: nothing to do. */
+	if (pcie_pmu->on_cpu != -1)
+		return 0;
+
+	pcie_pmu->on_cpu = cpu;
+	WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(cpu)));
+
+	return 0;
+}
+
+/*
+ * CPU hotplug teardown callback: when the CPU that owns the PMU goes offline,
+ * migrate the perf context and the PMU interrupt to another online CPU.
+ *
+ * @cpu: CPU that is going offline.
+ * @node: hotplug instance node embedded in struct hisi_pcie_pmu.
+ *
+ * Always returns 0. If no other CPU is available the PMU is left without an
+ * owner (on_cpu == -1).
+ */
+static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node);
+	unsigned int target;
+
+	/* Nothing to do if this CPU doesn't own the PMU */
+	if (pcie_pmu->on_cpu != cpu)
+		return 0;
+
+	pcie_pmu->on_cpu = -1;
+	/*
+	 * Choose a new CPU from all online cpus, explicitly excluding @cpu:
+	 * the outgoing CPU can still be set in cpu_online_mask while this
+	 * callback runs, so picking the first online CPU could select it again.
+	 */
+	target = cpumask_any_but(cpu_online_mask, cpu);
+	if (target >= nr_cpu_ids) {
+		pci_err(pcie_pmu->pdev, "There is no CPU to set\n");
+		return 0;
+	}
+
+	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
+	/* Use this CPU for event counting */
+	pcie_pmu->on_cpu = target;
+	WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(target)));
+
+	return 0;
+}
+
+/*
+ * Named events exported through the "events" attribute group. Each entry
+ * pairs an event name with the raw event code passed to
+ * HISI_PCIE_PMU_EVENT_ATTR(); the list is NULL-terminated.
+ */
+static struct attribute *hisi_pcie_pmu_events_attr[] = {
+ HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_latency, 0x0010),
+ HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_cnt, 0x10010),
+ HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_latency, 0x0210),
+ HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_cnt, 0x10210),
+ HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_latency, 0x0011),
+ HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_cnt, 0x10011),
+ HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_flux, 0x1005),
+ HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_time, 0x11005),
+ HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_flux, 0x2004),
+ HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_time, 0x12004),
+ NULL
+};
+
+/*
+ * "events" attribute group wrapping hisi_pcie_pmu_events_attr. Declared const
+ * like the other attribute groups of this driver; it is only referenced
+ * through the const pointer array hisi_pcie_pmu_attr_groups.
+ */
+static const struct attribute_group hisi_pcie_pmu_events_group = {
+	.name = "events",
+	.attrs = hisi_pcie_pmu_events_attr,
+};
+
+/*
+ * Format descriptors exported through the "format" attribute group. Each
+ * string names the perf_event_attr config field and bit range that carries
+ * the parameter (event in config, threshold/trigger settings in config1,
+ * port and bdf in config2); the list is NULL-terminated.
+ */
+static struct attribute *hisi_pcie_pmu_format_attr[] = {
+ HISI_PCIE_PMU_FORMAT_ATTR(event, "config:0-16"),
+ HISI_PCIE_PMU_FORMAT_ATTR(thr_len, "config1:0-3"),
+ HISI_PCIE_PMU_FORMAT_ATTR(thr_mode, "config1:4"),
+ HISI_PCIE_PMU_FORMAT_ATTR(trig_len, "config1:5-8"),
+ HISI_PCIE_PMU_FORMAT_ATTR(trig_mode, "config1:9"),
+ HISI_PCIE_PMU_FORMAT_ATTR(port, "config2:0-15"),
+ HISI_PCIE_PMU_FORMAT_ATTR(bdf, "config2:16-31"),
+ NULL
+};
+
+/* "format" attribute group wrapping hisi_pcie_pmu_format_attr. */
+static const struct attribute_group hisi_pcie_pmu_format_group = {
+ .name = "format",
+ .attrs = hisi_pcie_pmu_format_attr,
+};
+
+/*
+ * Three unnamed attribute groups, each holding one device attribute
+ * (dev_attr_bus, dev_attr_cpumask, dev_attr_identifier). The groups set no
+ * .name, only .attrs.
+ */
+static struct attribute *hisi_pcie_pmu_bus_attrs[] = {
+ &dev_attr_bus.attr,
+ NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_bus_attr_group = {
+ .attrs = hisi_pcie_pmu_bus_attrs,
+};
+
+static struct attribute *hisi_pcie_pmu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_cpumask_attr_group = {
+ .attrs = hisi_pcie_pmu_cpumask_attrs,
+};
+
+static struct attribute *hisi_pcie_pmu_identifier_attrs[] = {
+ &dev_attr_identifier.attr,
+ NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_identifier_attr_group = {
+ .attrs = hisi_pcie_pmu_identifier_attrs,
+};
+
+/* NULL-terminated list of all attribute groups; installed as pmu.attr_groups in hisi_pcie_alloc_pmu(). */
+static const struct attribute_group *hisi_pcie_pmu_attr_groups[] = {
+ &hisi_pcie_pmu_events_group,
+ &hisi_pcie_pmu_format_group,
+ &hisi_pcie_pmu_bus_attr_group,
+ &hisi_pcie_pmu_cpumask_attr_group,
+ &hisi_pcie_pmu_identifier_attr_group,
+ NULL
+};
+
+/*
+ * Fill in @pcie_pmu from the hardware and set up its struct pmu.
+ *
+ * Reads the BDF range (HISI_PCIE_REG_BDF) and the SICL/core ids
+ * (HISI_PCIE_REG_INFO), derives the PMU name "hisi_pcie<sicl>_core<core>"
+ * from the ids, reads the identifier from HISI_PCIE_REG_VERSION and installs
+ * the perf callbacks. @pcie_pmu->base must already be mapped.
+ *
+ * Returns 0 on success or -ENOMEM if the name cannot be allocated.
+ */
+static int hisi_pcie_alloc_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	struct hisi_pcie_reg_pair pair;
+	u16 sicl, core;
+	char *pmu_name;
+
+	pair = hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_BDF);
+	pcie_pmu->bdf_min = pair.lo;
+	pcie_pmu->bdf_max = pair.hi;
+
+	pair = hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_INFO);
+	sicl = pair.hi;
+	core = pair.lo;
+
+	/* Device-managed allocation: released together with @pdev's device. */
+	pmu_name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_pcie%u_core%u", sicl, core);
+	if (!pmu_name)
+		return -ENOMEM;
+
+	pcie_pmu->pdev = pdev;
+	/* No owning CPU yet; the hotplug online callback assigns one. */
+	pcie_pmu->on_cpu = -1;
+	pcie_pmu->identifier = readl(pcie_pmu->base + HISI_PCIE_REG_VERSION);
+	pcie_pmu->pmu = (struct pmu) {
+		.name		= pmu_name,
+		.module		= THIS_MODULE,
+		.event_init	= hisi_pcie_pmu_event_init,
+		.pmu_enable	= hisi_pcie_pmu_enable,
+		.pmu_disable	= hisi_pcie_pmu_disable,
+		.add		= hisi_pcie_pmu_add,
+		.del		= hisi_pcie_pmu_del,
+		.start		= hisi_pcie_pmu_start,
+		.stop		= hisi_pcie_pmu_stop,
+		.read		= hisi_pcie_pmu_read,
+		.task_ctx_nr	= perf_invalid_context,
+		.attr_groups	= hisi_pcie_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+	};
+
+	return 0;
+}
+
+/*
+ * Bring the PMU up: map BAR 2, fill in @pcie_pmu, register the interrupt,
+ * add the CPU hotplug instance and finally register with perf.
+ *
+ * Returns 0 on success. On failure every step already completed is undone in
+ * reverse order and the negative error code is returned.
+ */
+static int hisi_pcie_init_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	int err;
+
+	pcie_pmu->base = pci_ioremap_bar(pdev, 2);
+	if (!pcie_pmu->base) {
+		pci_err(pdev, "Ioremap failed for pcie_pmu resource\n");
+		return -ENOMEM;
+	}
+
+	err = hisi_pcie_alloc_pmu(pdev, pcie_pmu);
+	if (err)
+		goto unmap;
+
+	err = hisi_pcie_pmu_irq_register(pdev, pcie_pmu);
+	if (err)
+		goto unmap;
+
+	err = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+	if (err) {
+		pci_err(pdev, "Failed to register hotplug: %d\n", err);
+		goto release_irq;
+	}
+
+	err = perf_pmu_register(&pcie_pmu->pmu, pcie_pmu->pmu.name, -1);
+	if (err) {
+		pci_err(pdev, "Failed to register PCIe PMU: %d\n", err);
+		goto remove_hotplug;
+	}
+
+	return err;
+
+remove_hotplug:
+	/* _nocalls: drop the instance without running the offline callback. */
+	cpuhp_state_remove_instance_nocalls(
+		CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+
+release_irq:
+	hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu);
+
+unmap:
+	iounmap(pcie_pmu->base);
+
+	return err;
+}
+
+/*
+ * Tear down everything hisi_pcie_init_pmu() set up, in reverse order:
+ * unregister from perf, remove the hotplug instance, release the interrupt
+ * and unmap the registers. The PMU is looked up through @pdev's drvdata.
+ */
+static void hisi_pcie_uninit_pmu(struct pci_dev *pdev)
+{
+ struct hisi_pcie_pmu *pcie_pmu = pci_get_drvdata(pdev);
+
+ perf_pmu_unregister(&pcie_pmu->pmu);
+ cpuhp_state_remove_instance_nocalls(
+ CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+ hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu);
+ iounmap(pcie_pmu->base);
+}
+
+/*
+ * Basic PCI bring-up for @pdev: managed enable, managed request/iomap of
+ * BAR 2, and bus mastering.
+ *
+ * Returns 0 on success or the negative error from the PCI core.
+ */
+static int hisi_pcie_init_dev(struct pci_dev *pdev)
+{
+	int err = pcim_enable_device(pdev);
+
+	if (err) {
+		pci_err(pdev, "Failed to enable PCI device: %d\n", err);
+		return err;
+	}
+
+	/* BIT(2) selects BAR 2 only. */
+	err = pcim_iomap_regions(pdev, BIT(2), DRV_NAME);
+	if (err < 0) {
+		pci_err(pdev, "Failed to request PCI mem regions: %d\n", err);
+		return err;
+	}
+
+	pci_set_master(pdev);
+
+	return 0;
+}
+
+/*
+ * PCI probe callback: allocate the (device-managed) driver state, initialise
+ * the PCI device and the PMU, and publish the state as drvdata.
+ *
+ * Returns 0 on success, -ENOMEM if the state cannot be allocated, or the
+ * error from either initialisation step.
+ */
+static int hisi_pcie_pmu_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct hisi_pcie_pmu *pmu_dev;
+	int err;
+
+	pmu_dev = devm_kzalloc(&pdev->dev, sizeof(*pmu_dev), GFP_KERNEL);
+	if (!pmu_dev)
+		return -ENOMEM;
+
+	err = hisi_pcie_init_dev(pdev);
+	if (err)
+		return err;
+
+	err = hisi_pcie_init_pmu(pdev, pmu_dev);
+	if (err)
+		return err;
+
+	/* Needed by hisi_pcie_uninit_pmu() at remove time. */
+	pci_set_drvdata(pdev, pmu_dev);
+
+	return 0;
+}
+
+/* PCI remove callback: tear the PMU down, then clear the drvdata pointer. */
+static void hisi_pcie_pmu_remove(struct pci_dev *pdev)
+{
+ hisi_pcie_uninit_pmu(pdev);
+ pci_set_drvdata(pdev, NULL);
+}
+
+/* PCI IDs this driver binds to: Huawei vendor ID, device 0xa12d. Zero-terminated. */
+static const struct pci_device_id hisi_pcie_pmu_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, 0xa12d) },
+ { 0, }
+};
+MODULE_DEVICE_TABLE(pci, hisi_pcie_pmu_ids);
+
+/* PCI driver descriptor tying the ID table to the probe/remove callbacks. */
+static struct pci_driver hisi_pcie_pmu_driver = {
+ .name = DRV_NAME,
+ .id_table = hisi_pcie_pmu_ids,
+ .probe = hisi_pcie_pmu_probe,
+ .remove = hisi_pcie_pmu_remove,
+};
+
+/*
+ * Module init: set up the multi-instance CPU hotplug state first, then
+ * register the PCI driver. The hotplug state is removed again if driver
+ * registration fails.
+ *
+ * Returns 0 on success or the negative error of the failing step.
+ */
+static int __init hisi_pcie_module_init(void)
+{
+	int err;
+
+	err = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
+				      "AP_PERF_ARM_HISI_PCIE_PMU_ONLINE",
+				      hisi_pcie_pmu_online_cpu,
+				      hisi_pcie_pmu_offline_cpu);
+	if (err) {
+		pr_err("Failed to setup PCIe PMU hotplug: %d\n", err);
+		return err;
+	}
+
+	err = pci_register_driver(&hisi_pcie_pmu_driver);
+	if (!err)
+		return 0;
+
+	cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE);
+
+	return err;
+}
+module_init(hisi_pcie_module_init);
+
+/* Module exit: reverse of module init - unregister the PCI driver, then remove the hotplug state. */
+static void __exit hisi_pcie_module_exit(void)
+{
+ pci_unregister_driver(&hisi_pcie_pmu_driver);
+ cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE);
+}
+module_exit(hisi_pcie_module_exit);
+
+MODULE_DESCRIPTION("HiSilicon PCIe PMU driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Qi Liu <liuqi115(a)huawei.com>");
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index b98b9eb7d5f8..24e7be132046 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -178,6 +178,9 @@ enum cpuhp_state {
CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
+ #ifndef __GENKSYMS__
+ CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
+ #endif
CPUHP_AP_PERF_ARM_L2X0_ONLINE,
CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,
--
2.20.1
1
14
d1ac096f8869 mm: userfaultfd: fix missing cache flush in mcopy_atomic_pte() and
__mcopy_atomic()
c6cbf5431a62 mm: hugetlb: fix missing cache flush in copy_huge_page_from_user()
308ff6a6e768 mm: fix missing cache flush for all tail pages of compound page
185fa5984d7a Bluetooth: Fix the creation of hdev->name
9ff4a6b80642 arm: remove CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
dfb55dcf9d39 nfp: bpf: silence bitwise vs. logical OR warning
f89f76f4b0e7 drm/amd/display/dc/gpio/gpio_service: Pass around correct
dce_{version, environment} types
efd1429fa99b block: drbd: drbd_nl: Make conversion to 'enum drbd_ret_code'
explicit
a71658c7db0b regulator: consumer: Add missing stubs to regulator/consumer.h
7648f42d1a62 MIPS: Use address-of operator on section symbols
Total patches: 10
Dmitry Osipenko (1):
regulator: consumer: Add missing stubs to regulator/consumer.h
Itay Iellin (1):
Bluetooth: Fix the creation of hdev->name
Lee Jones (2):
block: drbd: drbd_nl: Make conversion to 'enum drbd_ret_code' explicit
drm/amd/display/dc/gpio/gpio_service: Pass around correct
dce_{version, environment} types
Mike Rapoport (1):
arm: remove CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
Muchun Song (3):
mm: fix missing cache flush for all tail pages of compound page
mm: hugetlb: fix missing cache flush in copy_huge_page_from_user()
mm: userfaultfd: fix missing cache flush in mcopy_atomic_pte() and
__mcopy_atomic()
Nathan Chancellor (2):
MIPS: Use address-of operator on section symbols
nfp: bpf: silence bitwise vs. logical OR warning
Documentation/vm/memory-model.rst | 3 +-
arch/arm/Kconfig | 8 ++---
arch/arm/mach-bcm/Kconfig | 1 -
arch/arm/mach-davinci/Kconfig | 1 -
arch/arm/mach-exynos/Kconfig | 1 -
arch/arm/mach-highbank/Kconfig | 1 -
arch/arm/mach-omap2/Kconfig | 1 -
arch/arm/mach-s5pv210/Kconfig | 1 -
arch/arm/mach-tango/Kconfig | 1 -
arch/mips/bmips/setup.c | 2 +-
arch/mips/lantiq/prom.c | 2 +-
arch/mips/pic32/pic32mzda/init.c | 2 +-
arch/mips/ralink/of.c | 2 +-
drivers/block/drbd/drbd_nl.c | 13 +++++---
.../drm/amd/display/dc/gpio/gpio_service.c | 12 +++----
.../display/include/gpio_service_interface.h | 4 +--
drivers/net/ethernet/netronome/nfp/nfp_asm.c | 4 +--
fs/proc/kcore.c | 2 --
include/linux/mmzone.h | 31 -------------------
include/linux/regulator/consumer.h | 30 ++++++++++++++++++
include/net/bluetooth/hci_core.h | 3 ++
mm/memory.c | 2 ++
mm/migrate.c | 7 +++--
mm/mmzone.c | 14 ---------
mm/userfaultfd.c | 3 ++
mm/vmstat.c | 4 ---
net/bluetooth/hci_core.c | 6 ++--
27 files changed, 71 insertions(+), 90 deletions(-)
--
2.20.1
1
10

[PATCH openEuler-5.10-LTS 1/6] coresight: etm4x: Workaround CPU hung bug on HiSilicon ETM
by Zheng Zengkai 09 Aug '22
by Zheng Zengkai 09 Aug '22
09 Aug '22
From: Junhao He <hejunhao3(a)huawei.com>
driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5EZY2
------------------------------------------------------------------
In FIFO mode, when the sink buffer is full, the sink device continuously
backpressures the ETM, so that the ETM cannot switch to the idle state.
In this case, the WFx instruction cannot be executed because the CPU
detects that the ETM is not in the idle state, which causes the CPU to
hang.
We work around this issue on HiSilicon ETM by setting bit 13 of TRCAUXCTLR,
which is used to indicate that the ETM is in the idle state.
The call trace is shown below:
rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
rcu: 10-...0: (1 ticks this GP) idle=5b6/1/0x4000000000000000 softirq=12309/12318 fqs=114196
(detected by 67, t=330041 jiffies, g=309253, q=453663)
Task dump for CPU 10:
task:ksoftirqd/10 state:R running task stack: 0 pid: 64 ppid: 2 flags:0x0000000a
Call trace:
__switch_to+0xbc/0xfc
irqtime_account_irq+0x58/0xc4
__do_softirq+0x6c/0x358
run_ksoftirqd+0x68/0x90
smpboot_thread_fn+0x15c/0x1a0
kthread+0x108/0x13c
ret_from_fork+0x10/0x18
watchdog: BUG: soft lockup - CPU#35 stuck for 22s! [bash:133345]
...
Call trace:
smp_call_function_single+0x178/0x190
etm4_disable_sysfs+0x74/0xfc [coresight_etm4x]
etm4_disable+0x6c/0x70 [coresight_etm4x]
coresight_disable_source+0x7c/0xa0 [coresight]
coresight_disable+0x6c/0x13c [coresight]
enable_source_store+0x88/0xa0 [coresight]
dev_attr_store+0x20/0x34
sysfs_kf_write+0x4c/0x5c
kernfs_fop_write_iter+0x130/0x1c0
new_sync_write+0xec/0x18c
vfs_write+0x214/0x2ac
ksys_write+0x70/0xfc
__arm64_sys_write+0x24/0x30
el0_svc_common.constprop.0+0x7c/0x1bc
do_el0_svc+0x2c/0x94
el0_svc+0x20/0x30
el0_sync_handler+0xb0/0xb4
el0_sync+0x160/0x180
Signed-off-by: Qi Liu <liuqi115(a)huawei.com>
Signed-off-by: Junhao He <hejunhao3(a)huawei.com>
Reviewed-by: Jay Fang <f.fangjian(a)huawei.com>
Acked-by: Xie XiuQi <xiexiuqi(a)huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai(a)huawei.com>
---
.../coresight/coresight-etm4x-core.c | 37 +++++++++++++++----
drivers/hwtracing/coresight/coresight-etm4x.h | 1 +
2 files changed, 30 insertions(+), 8 deletions(-)
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index d4d9c8bb88ca..881da29cbd5a 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -116,8 +116,10 @@ struct etm4_enable_arg {
#define HISI_HIP08_CORE_COMMIT_LVL_1 0b01
#define HISI_HIP08_CORE_COMMIT_REG sys_reg(3, 1, 15, 2, 5)
+#define HISI_HIP08_AUXCTRL_CHICKEN_BIT BIT(13)
+
struct etm4_arch_features {
- void (*arch_callback)(bool enable);
+ void (*arch_callback)(void *info);
};
static bool etm4_hisi_match_pid(unsigned int id)
@@ -125,8 +127,9 @@ static bool etm4_hisi_match_pid(unsigned int id)
return (id & ETM4_AMBA_MASK) == HISI_HIP08_AMBA_ID;
}
-static void etm4_hisi_config_core_commit(bool enable)
+static void etm4_hisi_config_core_commit(void *info)
{
+ bool enable = *(bool *)info;
u8 commit = enable ? HISI_HIP08_CORE_COMMIT_LVL_1 :
HISI_HIP08_CORE_COMMIT_FULL;
u64 val;
@@ -143,48 +146,67 @@ static void etm4_hisi_config_core_commit(bool enable)
write_sysreg_s(val, HISI_HIP08_CORE_COMMIT_REG);
}
+static void etm4_hisi_config_auxctrlr(void *info)
+{
+ struct etmv4_drvdata *drvdata = info;
+
+ /* Switch the ETM to idle state */
+ writel_relaxed(HISI_HIP08_AUXCTRL_CHICKEN_BIT, drvdata->base + TRCAUXCTLR);
+}
+
static struct etm4_arch_features etm4_features[] = {
[ETM4_IMPDEF_HISI_CORE_COMMIT] = {
.arch_callback = etm4_hisi_config_core_commit,
},
+ [ETM4_IMPDEF_HISI_SET_AUXCTRLR] = {
+ .arch_callback = etm4_hisi_config_auxctrlr,
+ },
{},
};
static void etm4_enable_arch_specific(struct etmv4_drvdata *drvdata)
{
struct etm4_arch_features *ftr;
+ bool enable = true;
int bit;
for_each_set_bit(bit, drvdata->arch_features, ETM4_IMPDEF_FEATURE_MAX) {
ftr = &etm4_features[bit];
- if (ftr->arch_callback)
- ftr->arch_callback(true);
+ if (bit == ETM4_IMPDEF_HISI_CORE_COMMIT && ftr->arch_callback)
+ ftr->arch_callback(&enable);
+
+ if (bit == ETM4_IMPDEF_HISI_SET_AUXCTRLR && ftr->arch_callback)
+ ftr->arch_callback(drvdata);
}
}
static void etm4_disable_arch_specific(struct etmv4_drvdata *drvdata)
{
struct etm4_arch_features *ftr;
+ bool enable = false;
int bit;
for_each_set_bit(bit, drvdata->arch_features, ETM4_IMPDEF_FEATURE_MAX) {
ftr = &etm4_features[bit];
- if (ftr->arch_callback)
- ftr->arch_callback(false);
+ if (bit == ETM4_IMPDEF_HISI_CORE_COMMIT && ftr->arch_callback)
+ ftr->arch_callback(&enable);
}
}
static void etm4_check_arch_features(struct etmv4_drvdata *drvdata,
unsigned int id)
{
- if (etm4_hisi_match_pid(id))
+ if (etm4_hisi_match_pid(id)) {
set_bit(ETM4_IMPDEF_HISI_CORE_COMMIT, drvdata->arch_features);
+ set_bit(ETM4_IMPDEF_HISI_SET_AUXCTRLR, drvdata->arch_features);
+ }
}
#else
static void etm4_enable_arch_specific(struct etmv4_drvdata *drvdata)
{
+ writel_relaxed(0x0, drvdata->base + TRCAUXCTLR);
}
static void etm4_disable_arch_specific(struct etmv4_drvdata *drvdata)
@@ -223,7 +245,6 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
writel_relaxed(config->pe_sel, drvdata->base + TRCPROCSELR);
writel_relaxed(config->cfg, drvdata->base + TRCCONFIGR);
/* nothing specific implemented */
- writel_relaxed(0x0, drvdata->base + TRCAUXCTLR);
writel_relaxed(config->eventctrl0, drvdata->base + TRCEVENTCTL0R);
writel_relaxed(config->eventctrl1, drvdata->base + TRCEVENTCTL1R);
if (drvdata->stallctl)
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h b/drivers/hwtracing/coresight/coresight-etm4x.h
index 3dd3e0633328..ef9d7365c2da 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.h
+++ b/drivers/hwtracing/coresight/coresight-etm4x.h
@@ -206,6 +206,7 @@
enum etm_impdef_type {
ETM4_IMPDEF_HISI_CORE_COMMIT,
+ ETM4_IMPDEF_HISI_SET_AUXCTRLR,
ETM4_IMPDEF_FEATURE_MAX,
};
--
2.20.1
1
5

09 Aug '22
From: Peter Zijlstra <peterz(a)infradead.org>
mainline inclusion
from mainline-v5.12-rc1
commit 87ccc826bf1c9e5ab4c2f649b404e02c63e47622
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5LCHG
CVE: NA
--------------------------------
Currently REG_SP_INDIRECT is unused but means (%rsp + offset),
change it to mean (%rsp) + offset.
The reason is that we're going to swizzle stack in the middle of a C
function with non-trivial stack footprint. This means that when the
unwinder finds the ToS, it needs to dereference it (%rsp) and then add
the offset to the next frame, resulting in: (%rsp) + offset
This is somewhat unfortunate, since REG_BP_INDIRECT is used (by DRAP)
and thus needs to retain the current (%rbp + offset).
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Miroslav Benes <mbenes(a)suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe(a)redhat.com>
Signed-off-by: Yipeng Zou <zouyipeng(a)huawei.com>
Reviewed-by: Zhang Jianhua <chris.zjh(a)huawei.com>
Reviewed-by: Liao Chang <liaochang1(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
arch/x86/kernel/unwind_orc.c | 5 ++++-
tools/objtool/orc_dump.c | 2 +-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index bafe953f5d7f..eea8ec5eca3b 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -450,7 +450,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_REG_SP_INDIRECT:
- sp = state->sp + orc->sp_offset;
+ sp = state->sp;
indirect = true;
break;
@@ -500,6 +500,9 @@ bool unwind_next_frame(struct unwind_state *state)
if (indirect) {
if (!deref_stack_reg(state, sp, &sp))
goto err;
+
+ if (orc->sp_reg == ORC_REG_SP_INDIRECT)
+ sp += orc->sp_offset;
}
/* Find IP, SP and possibly regs: */
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index faa444270ee3..ba28830aace2 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -64,7 +64,7 @@ static void print_reg(unsigned int reg, int offset)
if (reg == ORC_REG_BP_INDIRECT)
printf("(bp%+d)", offset);
else if (reg == ORC_REG_SP_INDIRECT)
- printf("(sp%+d)", offset);
+ printf("(sp)%+d", offset);
else if (reg == ORC_REG_UNDEFINED)
printf("(und)");
else
--
2.25.1
1
6

[PATCH openEuler-1.0-LTS] netfilter: nf_queue: do not allow packet truncation below transport header offset
by Yongqiang Liu 09 Aug '22
by Yongqiang Liu 09 Aug '22
09 Aug '22
From: Florian Westphal <fw(a)strlen.de>
mainline inclusion
from mainline-v5.19-rc3
commit 99a63d36cb3ed5ca3aa6fcb64cffbeaf3b0fb164
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5J9R4
CVE: CVE-2022-36946
--------------------------------
Domingo Dirutigliano and Nicola Guerrera report kernel panic when
sending nf_queue verdict with 1-byte nfta_payload attribute.
The IP/IPv6 stack pulls the IP(v6) header from the packet after the
input hook.
If user truncates the packet below the header size, this skb_pull() will
result in a malformed skb (skb->len < 0).
Fixes: 7af4cc3fa158 ("[NETFILTER]: Add "nfnetlink_queue" netfilter queue handler over nfnetlink")
Reported-by: Domingo Dirutigliano <pwnzer0tt1(a)proton.me>
Signed-off-by: Florian Westphal <fw(a)strlen.de>
Reviewed-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
Signed-off-by: Ziyang Xuan <william.xuanziyang(a)huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng(a)huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
net/netfilter/nfnetlink_queue.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index a5aff2834bd6..cd496b074a71 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -850,11 +850,16 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
}
static int
-nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
+nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
struct sk_buff *nskb;
if (diff < 0) {
+ unsigned int min_len = skb_transport_offset(e->skb);
+
+ if (data_len < min_len)
+ return -EINVAL;
+
if (pskb_trim(e->skb, data_len))
return -ENOMEM;
} else if (diff > 0) {
--
2.25.1
1
0

[PATCH openEuler-1.0-LTS] openvswitch: fix OOB access in reserve_sfa_size()
by Yongqiang Liu 08 Aug '22
by Yongqiang Liu 08 Aug '22
08 Aug '22
From: Paolo Valerio <pvalerio(a)redhat.com>
stable inclusion
from stable-v4.19.240
commit bbbf059337f9a74285c1cf088ff85ee92d149e64
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5KKF2
CVE: CVE-2022-2639
-------------------------------------------------
commit cefa91b2332d7009bc0be5d951d6cbbf349f90f8 upstream.
Given a sufficiently large number of actions, while copying and
reserving memory for a new action of a new flow, if next_offset is
greater than MAX_ACTIONS_BUFSIZE, the function reserve_sfa_size() does
not return -EMSGSIZE as expected, but it allocates MAX_ACTIONS_BUFSIZE
bytes increasing actions_len by req_size. This can then lead to an OOB
write access, especially when further actions need to be copied.
Fix it by rearranging the flow action size check.
KASAN splat below:
==================================================================
BUG: KASAN: slab-out-of-bounds in reserve_sfa_size+0x1ba/0x380 [openvswitch]
Write of size 65360 at addr ffff888147e4001c by task handler15/836
CPU: 1 PID: 836 Comm: handler15 Not tainted 5.18.0-rc1+ #27
...
Call Trace:
<TASK>
dump_stack_lvl+0x45/0x5a
print_report.cold+0x5e/0x5db
? __lock_text_start+0x8/0x8
? reserve_sfa_size+0x1ba/0x380 [openvswitch]
kasan_report+0xb5/0x130
? reserve_sfa_size+0x1ba/0x380 [openvswitch]
kasan_check_range+0xf5/0x1d0
memcpy+0x39/0x60
reserve_sfa_size+0x1ba/0x380 [openvswitch]
__add_action+0x24/0x120 [openvswitch]
ovs_nla_add_action+0xe/0x20 [openvswitch]
ovs_ct_copy_action+0x29d/0x1130 [openvswitch]
? __kernel_text_address+0xe/0x30
? unwind_get_return_address+0x56/0xa0
? create_prof_cpu_mask+0x20/0x20
? ovs_ct_verify+0xf0/0xf0 [openvswitch]
? prep_compound_page+0x198/0x2a0
? __kasan_check_byte+0x10/0x40
? kasan_unpoison+0x40/0x70
? ksize+0x44/0x60
? reserve_sfa_size+0x75/0x380 [openvswitch]
__ovs_nla_copy_actions+0xc26/0x2070 [openvswitch]
? __zone_watermark_ok+0x420/0x420
? validate_set.constprop.0+0xc90/0xc90 [openvswitch]
? __alloc_pages+0x1a9/0x3e0
? __alloc_pages_slowpath.constprop.0+0x1da0/0x1da0
? unwind_next_frame+0x991/0x1e40
? __mod_node_page_state+0x99/0x120
? __mod_lruvec_page_state+0x2e3/0x470
? __kasan_kmalloc_large+0x90/0xe0
ovs_nla_copy_actions+0x1b4/0x2c0 [openvswitch]
ovs_flow_cmd_new+0x3cd/0xb10 [openvswitch]
...
Cc: stable(a)vger.kernel.org
Fixes: f28cd2af22a0 ("openvswitch: fix flow actions reallocation")
Signed-off-by: Paolo Valerio <pvalerio(a)redhat.com>
Acked-by: Eelco Chaudron <echaudro(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Lu Wei <luwei32(a)huawei.com>
Reviewed-by: Yue Haibing <yuehaibing(a)huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng(a)huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
net/openvswitch/flow_netlink.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index eab5e8eaddaa..217c64fbc659 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2316,7 +2316,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2);
if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
- if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) {
+ if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) {
OVS_NLERR(log, "Flow action size exceeds max %u",
MAX_ACTIONS_BUFSIZE);
return ERR_PTR(-EMSGSIZE);
--
2.25.1
1
0