Kernel

[PATCH OLK-6.6 0/2] mm/smaps: fix race between smaps_hugetlb_range and migration
by Jinjiang Tu 26 Aug '25
David Hildenbrand (1):
fs/proc/task_mmu: convert smaps_hugetlb_range() to work on folios
Jinjiang Tu (1):
mm/smaps: fix race between smaps_hugetlb_range and migration
fs/proc/task_mmu.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
--
2.43.0

26 Aug '25
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICS3XV
CVE: NA
--------------------------------
This introduces a network optimization method named oenetcls. It
configures ntuple rules and binds interrupts to netdev queues
automatically.
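A minimal usage sketch (the parameter names are the module_param
definitions added by this patch; the interface and application names
are placeholders, not part of the patch):
  modprobe oenetcls ifname=eth0 mode=0 appname=redis-server
mode=0 selects the ntuple path and mode=1 the flow-steering path; the
strategy parameter chooses the IRQ affinity layout (0 default,
1 cluster, 2 NUMA, 3 custom).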
Signed-off-by: Yue Haibing <yuehaibing(a)huawei.com>
Signed-off-by: Wang Liang <wangliang74(a)huawei.com>
Signed-off-by: Liu Jian <liujian56(a)huawei.com>
Signed-off-by: yuelg <yuelg(a)chinaunicom.cn>
---
arch/arm64/configs/openeuler_defconfig | 1 +
arch/x86/configs/openeuler_defconfig | 1 +
drivers/hooks/Kconfig | 10 +
drivers/hooks/vendor_hooks.c | 8 +
include/net/netdev_rx_queue.h | 2 +-
include/trace/hooks/oenetcls.h | 44 +
kernel/irq/irqdesc.c | 2 +-
net/Kconfig | 1 +
net/Makefile | 1 +
net/core/dev.c | 20 +
net/ipv4/af_inet.c | 4 +
net/ipv4/tcp.c | 7 +
net/oenetcls/Kconfig | 10 +
net/oenetcls/Makefile | 8 +
net/oenetcls/asmdefs.h | 61 ++
net/oenetcls/memcpy-sve.S | 157 ++++
net/oenetcls/oenetcls.h | 177 ++++
net/oenetcls/oenetcls_flow.c | 403 +++++++++
net/oenetcls/oenetcls_main.c | 1076 ++++++++++++++++++++++++
net/oenetcls/oenetcls_ntuple.c | 565 +++++++++++++
20 files changed, 2556 insertions(+), 2 deletions(-)
create mode 100644 include/trace/hooks/oenetcls.h
create mode 100644 net/oenetcls/Kconfig
create mode 100644 net/oenetcls/Makefile
create mode 100644 net/oenetcls/asmdefs.h
create mode 100644 net/oenetcls/memcpy-sve.S
create mode 100644 net/oenetcls/oenetcls.h
create mode 100644 net/oenetcls/oenetcls_flow.c
create mode 100644 net/oenetcls/oenetcls_main.c
create mode 100644 net/oenetcls/oenetcls_ntuple.c
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 6b2116f83cbf..504aa8ae996d 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -6944,6 +6944,7 @@ CONFIG_USB4=m
#
CONFIG_VENDOR_HOOKS=y
CONFIG_VENDOR_BOND_HOOKS=y
+# CONFIG_OENETCLS_HOOKS is not set
# end of Vendor Hooks
CONFIG_LIBNVDIMM=m
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 52e6ccad8aa8..3100495149e2 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -8145,6 +8145,7 @@ CONFIG_USB4=m
#
CONFIG_VENDOR_HOOKS=y
CONFIG_VENDOR_BOND_HOOKS=y
+# CONFIG_OENETCLS_HOOKS is not set
# end of Vendor Hooks
CONFIG_LIBNVDIMM=m
diff --git a/drivers/hooks/Kconfig b/drivers/hooks/Kconfig
index 6a00168e67ad..90b0f6ea4040 100644
--- a/drivers/hooks/Kconfig
+++ b/drivers/hooks/Kconfig
@@ -20,4 +20,14 @@ config VENDOR_BOND_HOOKS
Allow vendor modules to attach bonding driver hooks defined via
DECLARE_HOOK or DECLARE_RESTRICTED_HOOK.
+config OENETCLS_HOOKS
+ bool "Oenetcls driver Hooks"
+ depends on VENDOR_HOOKS
+ default n
+ help
+ Enable oenetcls vendor hooks.
+ Allow vendor modules to attach oenetcls hooks defined via
+ DECLARE_HOOK or DECLARE_RESTRICTED_HOOK.
+ Use OENETCLS && OENETCLS_HOOKS to enable the oenetcls feature.
+
endmenu
diff --git a/drivers/hooks/vendor_hooks.c b/drivers/hooks/vendor_hooks.c
index 85bda58159f6..d9b85b57a742 100644
--- a/drivers/hooks/vendor_hooks.c
+++ b/drivers/hooks/vendor_hooks.c
@@ -9,6 +9,7 @@
#define CREATE_TRACE_POINTS
#include <trace/hooks/vendor_hooks.h>
#include <trace/hooks/bonding.h>
+#include <trace/hooks/oenetcls.h>
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -18,3 +19,10 @@
#ifdef CONFIG_VENDOR_BOND_HOOKS
EXPORT_TRACEPOINT_SYMBOL_GPL(vendor_bond_check_dev_link);
#endif
+
+#ifdef CONFIG_OENETCLS_HOOKS
+EXPORT_TRACEPOINT_SYMBOL_GPL(oecls_flow_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(oecls_set_cpu);
+EXPORT_TRACEPOINT_SYMBOL_GPL(oecls_timeout);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ethtool_cfg_rxcls);
+#endif
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index 377f43745abf..3fb5d8eb18fc 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -22,7 +22,7 @@ struct netdev_rx_queue {
struct xsk_buff_pool *pool;
#endif
- KABI_RESERVE(1)
+ KABI_USE(1, void *__rcu oecls_ftb)
KABI_RESERVE(2)
KABI_RESERVE(3)
KABI_RESERVE(4)
diff --git a/include/trace/hooks/oenetcls.h b/include/trace/hooks/oenetcls.h
new file mode 100644
index 000000000000..c38545d7a6a2
--- /dev/null
+++ b/include/trace/hooks/oenetcls.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * oenetcls driver Hooks
+ *
+ * Copyright (c) 2025, Huawei Tech. Co., Ltd.
+ */
+
+#ifdef CONFIG_OENETCLS_HOOKS
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM oenetcls
+
+#define TRACE_INCLUDE_PATH trace/hooks
+#if !defined(_TRACE_OENETCLS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OENETCLS_H
+#include <linux/tracepoint.h>
+#include <trace/hooks/vendor_hooks.h>
+
+struct sock;
+struct sk_buff;
+struct net_device;
+
+DECLARE_HOOK(oecls_flow_update,
+TP_PROTO(struct sock *sk),
+TP_ARGS(sk));
+
+DECLARE_HOOK(oecls_set_cpu,
+TP_PROTO(struct sk_buff *skb),
+TP_ARGS(skb));
+
+DECLARE_HOOK(oecls_timeout,
+TP_PROTO(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id, bool *ret),
+TP_ARGS(dev, rxq_index, flow_id, filter_id, ret));
+
+DECLARE_HOOK(ethtool_cfg_rxcls,
+TP_PROTO(struct sock *sk, int is_del),
+TP_ARGS(sk, is_del));
+
+#endif
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
+#endif
+
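For review context, a minimal sketch of how a module attaches to one of
these hooks (the probe name is illustrative; the register/unregister
helpers are the ones generated by DECLARE_HOOK, and are what
net/oenetcls/oenetcls_flow.c later in this series uses):
  static void my_flow_update(void *data, struct sock *sk)
  {
          /* per-socket bookkeeping goes here */
  }
  /* typically called from module init/exit */
  register_trace_oecls_flow_update(my_flow_update, NULL);
  unregister_trace_oecls_flow_update(my_flow_update, NULL);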
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 46094f0c9fcd..29f4101585cf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -383,7 +383,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return mtree_load(&sparse_irqs, irq);
}
-#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE
+#if defined(CONFIG_KVM_BOOK3S_64_HV_MODULE) || IS_ENABLED(CONFIG_OENETCLS)
EXPORT_SYMBOL_GPL(irq_to_desc);
#endif
diff --git a/net/Kconfig b/net/Kconfig
index 2fc1860faeb4..bea9c2529bb1 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -72,6 +72,7 @@ source "net/xfrm/Kconfig"
source "net/iucv/Kconfig"
source "net/smc/Kconfig"
source "net/xdp/Kconfig"
+source "net/oenetcls/Kconfig"
config NET_HANDSHAKE
bool
diff --git a/net/Makefile b/net/Makefile
index 4c4dc535453d..4ffee8a3c427 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -79,4 +79,5 @@ obj-$(CONFIG_NET_NCSI) += ncsi/
obj-$(CONFIG_XDP_SOCKETS) += xdp/
obj-$(CONFIG_MPTCP) += mptcp/
obj-$(CONFIG_MCTP) += mctp/
+obj-$(CONFIG_OENETCLS) += oenetcls/
obj-$(CONFIG_NET_HANDSHAKE) += handshake/
diff --git a/net/core/dev.c b/net/core/dev.c
index cbb4bd4718cd..a0624c801a35 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -154,6 +154,7 @@
#include <linux/once_lite.h>
#include <net/netdev_rx_queue.h>
#include <linux/if_caqm.h>
+#include <trace/hooks/oenetcls.h>
#include "dev.h"
#include "net-sysfs.h"
@@ -4727,6 +4728,11 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
bool expire = true;
unsigned int cpu;
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ trace_oecls_timeout(dev, rxq_index, flow_id, filter_id, &expire);
+ if (expire)
+ return true;
+#endif
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id <= flow_table->mask) {
@@ -5814,6 +5820,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
}
}
#endif
+
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ trace_oecls_set_cpu(skb);
+#endif
+
ret = __netif_receive_skb(skb);
rcu_read_unlock();
return ret;
@@ -5848,6 +5859,12 @@ void netif_receive_skb_list_internal(struct list_head *head)
}
}
#endif
+
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ list_for_each_entry_safe(skb, next, head, list)
+ trace_oecls_set_cpu(skb);
+#endif
+
__netif_receive_skb_list(head);
rcu_read_unlock();
}
@@ -9960,6 +9977,9 @@ int __netdev_update_features(struct net_device *dev)
return err < 0 ? 0 : 1;
}
+#if IS_ENABLED(CONFIG_OENETCLS)
+EXPORT_SYMBOL(__netdev_update_features);
+#endif
/**
* netdev_update_features - recalculate device features
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f336b2ddf972..ee224b196666 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -121,6 +121,7 @@
#include <net/compat.h>
#include <trace/events/sock.h>
+#include <trace/hooks/oenetcls.h>
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
@@ -219,6 +220,9 @@ int __inet_listen_sk(struct sock *sk, int backlog)
return err;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ trace_ethtool_cfg_rxcls(sk, 0);
+#endif
}
return 0;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c2419903f0e4..3e45b736aa10 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>
+#include <trace/hooks/oenetcls.h>
/* Track pending CMSGs. */
enum {
@@ -2577,6 +2578,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ trace_oecls_flow_update(sk);
+#endif
if (sk_can_busy_loop(sk) &&
skb_queue_empty_lockless(&sk->sk_receive_queue) &&
sk->sk_state == TCP_ESTABLISHED)
@@ -2940,6 +2944,9 @@ void __tcp_close(struct sock *sk, long timeout)
void tcp_close(struct sock *sk, long timeout)
{
lock_sock(sk);
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ trace_ethtool_cfg_rxcls(sk, 1);
+#endif
__tcp_close(sk, timeout);
release_sock(sk);
if (!sk->sk_net_refcnt)
diff --git a/net/oenetcls/Kconfig b/net/oenetcls/Kconfig
new file mode 100644
index 000000000000..2ab980258c31
--- /dev/null
+++ b/net/oenetcls/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config OENETCLS
+ tristate "Network classification"
+ depends on OENETCLS_HOOKS
+ default n
+ help
+ Allows configuring ntuple rules and binding interrupts to netdev
+ queues automatically.
+ Use OENETCLS && OENETCLS_HOOKS to enable the oenetcls feature.
+ Use the module parameter "mode" to select the running mode.
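A configuration sketch (assumed fragment, not part of this patch; note
that the defconfig hunks above leave OENETCLS_HOOKS disabled by
default):
  CONFIG_VENDOR_HOOKS=y
  CONFIG_OENETCLS_HOOKS=y
  CONFIG_OENETCLS=m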
diff --git a/net/oenetcls/Makefile b/net/oenetcls/Makefile
new file mode 100644
index 000000000000..cdf17ea096d3
--- /dev/null
+++ b/net/oenetcls/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_OENETCLS) += oenetcls.o
+oenetcls-y := oenetcls_main.o oenetcls_ntuple.o oenetcls_flow.o
+ifeq ($(CONFIG_ARM64_SVE),y)
+oenetcls-y += memcpy-sve.o
+endif
+
diff --git a/net/oenetcls/asmdefs.h b/net/oenetcls/asmdefs.h
new file mode 100644
index 000000000000..8138a94c18af
--- /dev/null
+++ b/net/oenetcls/asmdefs.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+/* Branch Target Identitication support. */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret). */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
+
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files. */
+GNU_PROPERTY(FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name, %function; \
+ .align alignment; \
+name: \
+ .cfi_startproc; \
+ BTI_C;
+
+#define ENTRY(name) ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name) \
+ .global name; \
+ .type name, %function; \
+ name:
+
+#define END(name) \
+ .cfi_endproc; \
+ .size name, .-name;
+
+#define L(l) .L ## l
+
+#endif
diff --git a/net/oenetcls/memcpy-sve.S b/net/oenetcls/memcpy-sve.S
new file mode 100644
index 000000000000..106e4c30294c
--- /dev/null
+++ b/net/oenetcls/memcpy-sve.S
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include "asmdefs.h"
+
+.arch armv8-a+sve
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp1 x6
+#define vlen x6
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small and simple and to improve performance.
+ SVE vectors are used to speed up small copies.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64_sve)
+ENTRY (__memcpy_aarch64_sve)
+ cmp count, 128
+ b.hi L(copy_long)
+ cntb vlen
+ cmp count, vlen, lsl 1
+ b.hi L(copy32_128)
+
+ whilelo p0.b, xzr, count
+ whilelo p1.b, vlen, count
+ ld1b z0.b, p0/z, [src, 0, mul vl]
+ ld1b z1.b, p1/z, [src, 1, mul vl]
+ st1b z0.b, p0, [dstin, 0, mul vl]
+ st1b z1.b, p1, [dstin, 1, mul vl]
+ ret
+
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ add srcend, src, count
+ add dstend, dstin, count
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ cbz tmp1, L(return)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+L(return):
+ ret
+
+END (__memcpy_aarch64_sve)
diff --git a/net/oenetcls/oenetcls.h b/net/oenetcls/oenetcls.h
new file mode 100644
index 000000000000..215ae3e7e153
--- /dev/null
+++ b/net/oenetcls/oenetcls.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _NET_OENETCLS_H
+#define _NET_OENETCLS_H
+#include <linux/if.h>
+#include <linux/mutex.h>
+#include <linux/cpufeature.h>
+
+#define OECLS_MAX_NETDEV_NUM 8
+#define OECLS_MAX_RXQ_NUM_PER_DEV 256
+#define OECLS_MAX_CPU_NUM 1024
+
+#define OECLS_TIMEOUT (5 * HZ)
+#define OECLS_NO_FILTER 0xffff
+#define OECLS_NO_CPU 0xffff
+
+struct oecls_netdev_queue_info {
+ int irq;
+ int affinity_cpu;
+};
+
+struct oecls_netdev_info {
+ char dev_name[IFNAMSIZ];
+ struct net_device *netdev;
+ int rxq_num;
+ struct oecls_netdev_queue_info rxq[OECLS_MAX_RXQ_NUM_PER_DEV];
+ int old_filter_state;
+};
+
+struct oecls_rxq {
+ int rxq_id;
+ int status;
+};
+
+struct oecls_numa_clusterinfo {
+ int cluster_id;
+ int cur_freeidx;
+ struct oecls_rxq rxqs[OECLS_MAX_RXQ_NUM_PER_DEV];
+};
+
+struct oecls_numa_bound_dev_info {
+ DECLARE_BITMAP(bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV);
+ struct oecls_numa_clusterinfo *cluster_info;
+};
+
+struct oecls_numa_info {
+ DECLARE_BITMAP(avail_cpus, OECLS_MAX_CPU_NUM);
+ struct oecls_numa_bound_dev_info bound_dev[OECLS_MAX_NETDEV_NUM];
+};
+
+struct cmd_context {
+ char netdev[IFNAMSIZ];
+ u32 dip4;
+ u16 dport;
+ u16 action;
+ u32 ruleid;
+ u32 del_ruleid;
+ int ret_loc;
+};
+
+#define OECLS_SK_RULE_HASHSIZE 256
+#define OECLS_SK_RULE_HASHMASK (OECLS_SK_RULE_HASHSIZE - 1)
+
+struct oecls_sk_rule_list {
+ struct hlist_head hash[OECLS_SK_RULE_HASHSIZE];
+ /* Mutex to synchronize access to ntuple rule locking */
+ struct mutex mutex;
+};
+
+struct oecls_sk_rule {
+ struct hlist_node node;
+ int devid;
+ void *sk;
+ int dip4;
+ int dport;
+ int action;
+ int ruleid;
+ int nid;
+};
+
+struct oecls_sk_entry {
+ struct hlist_node node;
+ void *sk;
+ u32 sk_rule_hash;
+};
+
+struct oecls_dev_flow {
+ unsigned short cpu;
+ unsigned short filter;
+ unsigned int last_qtail;
+ int isvalid;
+ unsigned long timeout;
+};
+
+struct oecls_dev_flow_table {
+ unsigned int mask;
+ struct rcu_head rcu;
+ struct oecls_dev_flow flows[];
+};
+
+struct oecls_sock_flow_table {
+ u32 mask;
+ u32 ents[] ____cacheline_aligned_in_smp;
+};
+
+#define OECLS_DEV_FLOW_TABLE_NUM 0x1000
+#define OECLS_SOCK_FLOW_TABLE_NUM 0x100000
+#define OECLS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct oecls_dev_flow_table) + \
+ ((_num) * sizeof(struct oecls_dev_flow)))
+#define OECLS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct oecls_sock_flow_table, ents[_num]))
+
+#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \
+ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH)
+#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \
+ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \
+ NETIF_F_RXHASH)
+
+struct rmgr_ctrl {
+ int driver_select;
+ unsigned long *slot;
+ __u32 n_rules;
+ __u32 size;
+};
+
+extern int match_ip_flag;
+extern int debug;
+extern int oecls_netdev_num;
+extern int oecls_numa_num;
+
+#define oecls_debug(fmt, ...) \
+ do { \
+ if (debug) \
+ trace_printk(fmt, ## __VA_ARGS__); \
+ } while (0)
+
+#define oecls_error(fmt, ...) \
+ do { \
+ pr_err("oenetcls [%s:%d]: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); \
+ trace_printk(fmt, ## __VA_ARGS__); \
+ } while (0)
+
+struct oecls_netdev_info *get_oecls_netdev_info(unsigned int index);
+
+#define for_each_oecls_netdev(devid, oecls_dev) \
+ for (devid = 0, oecls_dev = get_oecls_netdev_info(devid); \
+ (devid < oecls_netdev_num) && oecls_dev; \
+ devid++, oecls_dev = get_oecls_netdev_info(devid))
+
+struct oecls_numa_info *get_oecls_numa_info(unsigned int nid);
+
+#define for_each_oecls_numa(nid, numa_info) \
+ for (nid = 0, numa_info = get_oecls_numa_info(nid); \
+ (nid < oecls_numa_num) && numa_info; \
+ nid++, numa_info = get_oecls_numa_info(nid))
+
+#ifdef CONFIG_ARM64_SVE
+void *__memcpy_aarch64_sve(void *, const void *, size_t);
+#define memcpy_r(dst, src, len) \
+ do { \
+ if (system_supports_sve()) \
+ __memcpy_aarch64_sve(dst, src, len); \
+ else \
+ memcpy(dst, src, len); \
+ } while (0)
+#else
+#define memcpy_r(dst, src, len) memcpy(dst, src, len)
+#endif
+
+int check_appname(char *task_name);
+int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd);
+int alloc_rxq_id(int nid, int devid);
+void free_rxq_id(int nid, int devid, int rxq_id);
+void oecls_ntuple_res_init(void);
+void oecls_ntuple_res_clean(void);
+void oecls_flow_res_init(void);
+void oecls_flow_res_clean(void);
+
+#endif /* _NET_OENETCLS_H */
diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c
new file mode 100644
index 000000000000..5dc58e8bae25
--- /dev/null
+++ b/net/oenetcls/oenetcls_flow.c
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/inet.h>
+#include <net/netdev_rx_queue.h>
+#include <net/sock.h>
+#include <trace/hooks/oenetcls.h>
+#include "oenetcls.h"
+
+static u32 oecls_cpu_mask;
+static struct oecls_sock_flow_table __rcu *oecls_sock_flow_table;
+static DEFINE_MUTEX(oecls_sock_flow_mutex);
+static DEFINE_SPINLOCK(oecls_dev_flow_lock);
+
+bool is_oecls_config_netdev(const char *name)
+{
+ struct oecls_netdev_info *netdev_info;
+ int netdev_loop;
+
+ for_each_oecls_netdev(netdev_loop, netdev_info)
+ if (strcmp(netdev_info->dev_name, name) == 0)
+ return true;
+
+ return false;
+}
+
+static void oecls_timeout(void *data, struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id, bool *ret)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct oecls_dev_flow_table *flow_table;
+ struct oecls_dev_flow *rflow;
+ bool expire = true;
+ unsigned int cpu;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->oecls_ftb);
+ if (flow_table && flow_id <= flow_table->mask) {
+ rflow = &flow_table->flows[flow_id];
+ cpu = READ_ONCE(rflow->cpu);
+ oecls_debug("dev:%s, rxq:%d, flow_id:%u, filter_id:%d/%d, cpu:%d\n", dev->name,
+ rxq_index, flow_id, filter_id, rflow->filter, cpu);
+
+ if (rflow->filter == filter_id && cpu < nr_cpu_ids) {
+ if (time_before(jiffies, rflow->timeout + OECLS_TIMEOUT)) {
+ expire = false;
+ } else {
+ rflow->isvalid = 0;
+ WRITE_ONCE(rflow->cpu, OECLS_NO_CPU);
+ }
+ }
+ }
+ rcu_read_unlock();
+ oecls_debug("%s, dev:%s, rxq:%d, flow_id:%u, filter_id:%d, expire:%d\n", __func__,
+ dev->name, rxq_index, flow_id, filter_id, expire);
+ *ret = expire;
+}
+
+static void oecls_flow_update(void *data, struct sock *sk)
+{
+ struct oecls_sock_flow_table *tb;
+ unsigned int hash, index;
+ u32 val;
+ u32 cpu = raw_smp_processor_id();
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return;
+
+ if (check_appname(current->comm))
+ return;
+
+ rcu_read_lock();
+ tb = rcu_dereference(oecls_sock_flow_table);
+ hash = READ_ONCE(sk->sk_rxhash);
+ if (tb && hash) {
+ index = hash & tb->mask;
+ val = hash & ~oecls_cpu_mask;
+ val |= cpu;
+
+ if (READ_ONCE(tb->ents[index]) != val) {
+ WRITE_ONCE(tb->ents[index], val);
+
+ oecls_debug("[%s] sk:%p, hash:0x%x, index:0x%x, val:0x%x, cpu:%d\n",
+ current->comm, sk, hash, index, val, cpu);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb)
+{
+ struct oecls_netdev_info *netdev_info;
+ int netdev_loop;
+ u32 hash, index;
+ struct oecls_numa_info *numa_info;
+ struct oecls_numa_bound_dev_info *bound_dev = NULL;
+ int rxq_id, rxq_num, i;
+
+ numa_info = get_oecls_numa_info(nid);
+ if (!numa_info)
+ return -1;
+
+ for_each_oecls_netdev(netdev_loop, netdev_info) {
+ if (strcmp(netdev_info->dev_name, dev->name) == 0) {
+ bound_dev = &numa_info->bound_dev[netdev_loop];
+ break;
+ }
+ }
+
+ if (!bound_dev)
+ return -1;
+ rxq_num = bitmap_weight(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV);
+ if (rxq_num == 0)
+ return -1;
+
+ hash = skb_get_hash(skb);
+ index = hash % rxq_num;
+
+ i = 0;
+ for_each_set_bit(rxq_id, bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV)
+ if (index == i++)
+ return rxq_id;
+
+ return -1;
+}
+
+static void set_oecls_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct oecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu)
+{
+ struct netdev_rx_queue *rxqueue;
+ struct oecls_dev_flow_table *dtb;
+ struct oecls_dev_flow *rflow;
+ u32 flow_id, hash;
+ int rxq_index; /* signed: flow_get_queue_idx() may return -1 */
+ int rc;
+
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+ !(dev->features & NETIF_F_NTUPLE))
+ return;
+
+ rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb);
+ if (rxq_index == skb_get_rx_queue(skb) || rxq_index < 0)
+ return;
+
+ rxqueue = dev->_rx + rxq_index;
+ dtb = rcu_dereference(rxqueue->oecls_ftb);
+ if (!dtb)
+ return;
+
+ hash = skb_get_hash(skb);
+ flow_id = hash & dtb->mask;
+ rflow = &dtb->flows[flow_id];
+ if (rflow->isvalid && rflow->cpu == next_cpu) {
+ rflow->timeout = jiffies;
+ return;
+ }
+
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id);
+ oecls_debug("skb:%p, rxq:%d, hash:0x%x, flow_id:%u, old_rxq_id:%d, next_cpu:%d, rc:%d\n",
+ skb, rxq_index, hash, flow_id, old_rxq_id, next_cpu, rc);
+ if (rc < 0)
+ return;
+
+ rflow->filter = rc;
+ rflow->isvalid = 1;
+ rflow->timeout = jiffies;
+ if (old_rflow->filter == rflow->filter)
+ old_rflow->filter = OECLS_NO_FILTER;
+ rflow->cpu = next_cpu;
+}
+
+static void __oecls_set_cpu(struct sk_buff *skb, struct net_device *ndev,
+ struct oecls_sock_flow_table *tb, struct oecls_dev_flow_table *dtb,
+ int old_rxq_id)
+{
+ struct oecls_dev_flow *rflow;
+ u32 last_recv_cpu, hash, val;
+ u32 tcpu = 0;
+ u32 cpu = raw_smp_processor_id();
+
+ skb_reset_network_header(skb);
+ hash = skb_get_hash(skb);
+ if (!hash)
+ return;
+
+ val = READ_ONCE(tb->ents[hash & tb->mask]);
+ last_recv_cpu = val & oecls_cpu_mask;
+ rflow = &dtb->flows[hash & dtb->mask];
+ tcpu = rflow->cpu;
+
+ if ((val ^ hash) & ~oecls_cpu_mask)
+ return;
+
+ if (cpu_to_node(cpu) == cpu_to_node(last_recv_cpu))
+ return;
+
+ if (tcpu >= nr_cpu_ids)
+ set_oecls_cpu(ndev, skb, rflow, old_rxq_id, last_recv_cpu);
+}
+
+static void oecls_set_cpu(void *data, struct sk_buff *skb)
+{
+ struct net_device *ndev = skb->dev;
+ struct oecls_sock_flow_table *stb;
+ struct oecls_dev_flow_table *dtb;
+ struct netdev_rx_queue *rxqueue;
+ int rxq_id = -1;
+
+ if (!ndev)
+ return;
+
+ if (!is_oecls_config_netdev(ndev->name))
+ return;
+
+ rxqueue = ndev->_rx;
+ if (skb_rx_queue_recorded(skb)) {
+ rxq_id = skb_get_rx_queue(skb);
+ if (rxq_id >= ndev->real_num_rx_queues) {
+ oecls_debug("ndev:%s, rxq:%d, real_num:%d\n", ndev->name,
+ rxq_id, ndev->real_num_rx_queues);
+ return;
+ }
+ rxqueue += rxq_id;
+ }
+
+ // oecls_debug("skb:%px, dev:%s, rxq_id:%d\n", skb, ndev->name, rxq_id);
+ if (rxq_id < 0)
+ return;
+
+ rcu_read_lock();
+ stb = rcu_dereference(oecls_sock_flow_table);
+ dtb = rcu_dereference(rxqueue->oecls_ftb);
+ if (stb && dtb)
+ __oecls_set_cpu(skb, ndev, stb, dtb, rxq_id);
+
+ rcu_read_unlock();
+}
+
+static void oecls_dev_flow_table_free(struct rcu_head *rcu)
+{
+ struct oecls_dev_flow_table *table = container_of(rcu,
+ struct oecls_dev_flow_table, rcu);
+ vfree(table);
+}
+
+static void oecls_dev_flow_table_cleanup(struct net_device *netdev, int qid)
+{
+ struct oecls_dev_flow_table *dtb;
+ struct netdev_rx_queue *queue;
+ int i;
+
+ spin_lock(&oecls_dev_flow_lock);
+ for (i = 0; i < qid; i++) {
+ queue = netdev->_rx + i;
+ dtb = rcu_dereference_protected(queue->oecls_ftb,
+ lockdep_is_held(&oecls_dev_flow_lock));
+ rcu_assign_pointer(queue->oecls_ftb, NULL);
+ /* Free every queue's table; freeing only the last one would leak the rest. */
+ if (dtb)
+ call_rcu(&dtb->rcu, oecls_dev_flow_table_free);
+ }
+ spin_unlock(&oecls_dev_flow_lock);
+}
+
+static int oecls_dev_flow_table_release(void)
+{
+ struct oecls_netdev_info *netdev_info;
+ int netdev_loop;
+ struct net_device *netdev;
+
+ for_each_oecls_netdev(netdev_loop, netdev_info) {
+ netdev = netdev_info->netdev;
+ if (!netdev)
+ continue;
+ oecls_dev_flow_table_cleanup(netdev, netdev->num_rx_queues);
+ }
+
+ return 0;
+}
+
+static int _oecls_dev_flow_table_init(struct net_device *netdev)
+{
+ struct oecls_dev_flow_table *table;
+ int size = OECLS_DEV_FLOW_TABLE_NUM;
+ struct netdev_rx_queue *queue;
+ int i, j, ret = 0;
+
+ size = roundup_pow_of_two(size);
+ oecls_debug("dev:%s, num_rx_queues:%d, mask:0x%x\n", netdev->name, netdev->num_rx_queues,
+ size - 1);
+
+ for (i = 0; i < netdev->num_rx_queues; i++) {
+ table = vmalloc(OECLS_DEV_FLOW_TABLE_SIZE(size));
+ if (!table) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ table->mask = size - 1;
+ for (j = 0; j < size; j++) {
+ table->flows[j].cpu = OECLS_NO_CPU;
+ table->flows[j].isvalid = 0;
+ }
+
+ queue = netdev->_rx + i;
+
+ spin_lock(&oecls_dev_flow_lock);
+ rcu_assign_pointer(queue->oecls_ftb, table);
+ spin_unlock(&oecls_dev_flow_lock);
+ }
+ return ret;
+fail:
+ oecls_dev_flow_table_cleanup(netdev, i);
+ return ret;
+}
+
+static int oecls_dev_flow_table_init(void)
+{
+ struct oecls_netdev_info *netdev_info;
+ int netdev_loop;
+ struct net_device *ndev;
+ int i, err;
+
+ for_each_oecls_netdev(netdev_loop, netdev_info) {
+ ndev = netdev_info->netdev;
+ if (!ndev)
+ continue;
+ err = _oecls_dev_flow_table_init(ndev);
+ if (err)
+ goto out;
+ }
+
+ return 0;
+out:
+ for (i = 0; i < netdev_loop; i++) {
+ netdev_info = get_oecls_netdev_info(i);
+ ndev = netdev_info->netdev;
+ if (!ndev)
+ continue;
+ oecls_dev_flow_table_cleanup(ndev, ndev->num_rx_queues);
+ }
+ return err;
+}
+
+static int oecls_sock_flow_table_release(void)
+{
+ struct oecls_sock_flow_table *tb;
+
+ mutex_lock(&oecls_sock_flow_mutex);
+ tb = rcu_dereference_protected(oecls_sock_flow_table,
+ lockdep_is_held(&oecls_sock_flow_mutex));
+ if (tb)
+ rcu_assign_pointer(oecls_sock_flow_table, NULL);
+ mutex_unlock(&oecls_sock_flow_mutex);
+ synchronize_rcu();
+ vfree(tb);
+
+ unregister_trace_oecls_flow_update(&oecls_flow_update, NULL);
+ unregister_trace_oecls_set_cpu(&oecls_set_cpu, NULL);
+ unregister_trace_oecls_timeout(&oecls_timeout, NULL);
+ return 0;
+}
+
+static int oecls_sock_flow_table_init(void)
+{
+ struct oecls_sock_flow_table *table;
+ int size = OECLS_SOCK_FLOW_TABLE_NUM;
+ int i;
+
+ size = roundup_pow_of_two(size);
+ table = vmalloc(OECLS_SOCK_FLOW_TABLE_SIZE(size));
+ if (!table)
+ return -ENOMEM;
+
+ oecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
+ oecls_debug("nr_cpu_ids:%d, oecls_cpu_mask:0x%x\n", nr_cpu_ids, oecls_cpu_mask);
+
+ table->mask = size - 1;
+ for (i = 0; i < size; i++)
+ table->ents[i] = OECLS_NO_CPU;
+
+ mutex_lock(&oecls_sock_flow_mutex);
+ rcu_assign_pointer(oecls_sock_flow_table, table);
+ mutex_unlock(&oecls_sock_flow_mutex);
+
+ register_trace_oecls_flow_update(oecls_flow_update, NULL);
+ register_trace_oecls_set_cpu(&oecls_set_cpu, NULL);
+ register_trace_oecls_timeout(&oecls_timeout, NULL);
+ return 0;
+}
+
+void oecls_flow_res_init(void)
+{
+ oecls_sock_flow_table_init();
+ oecls_dev_flow_table_init();
+}
+
+void oecls_flow_res_clean(void)
+{
+ oecls_sock_flow_table_release();
+ oecls_dev_flow_table_release();
+}
diff --git a/net/oenetcls/oenetcls_main.c b/net/oenetcls/oenetcls_main.c
new file mode 100644
index 000000000000..67c73f4595be
--- /dev/null
+++ b/net/oenetcls/oenetcls_main.c
@@ -0,0 +1,1076 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netdev_features.h>
+#include <linux/ethtool.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/rtnetlink.h>
+#include "oenetcls.h"
+
+int oecls_netdev_num;
+static struct oecls_netdev_info oecls_netdev_info_table[OECLS_MAX_NETDEV_NUM];
+
+int oecls_numa_num;
+static int oecls_cluster_cpu_num, oecls_cluster_per_numa;
+static struct oecls_numa_info *oecls_numa_info_table;
+
+int debug;
+module_param(debug, int, 0644);
+MODULE_PARM_DESC(debug, "debug switch");
+
+static int mode;
+module_param(mode, int, 0444);
+MODULE_PARM_DESC(mode, "mode, default 0");
+
+static char ifname[64] = { 0 };
+module_param_string(ifname, ifname, sizeof(ifname), 0444);
+MODULE_PARM_DESC(ifname, "ifname");
+
+static char appname[64] = "redis-server";
+module_param_string(appname, appname, sizeof(appname), 0644);
+MODULE_PARM_DESC(appname, "appname, default redis-server");
+
+int match_ip_flag = 1;
+module_param(match_ip_flag, int, 0644);
+MODULE_PARM_DESC(match_ip_flag, "match ip flag");
+
+static int strategy;
+module_param(strategy, int, 0444);
+MODULE_PARM_DESC(strategy, "strategy, default 0");
+
+static bool check_params(void)
+{
+ if (mode != 0 && mode != 1)
+ return false;
+
+ if (strlen(ifname) == 0)
+ return false;
+
+ return true;
+}
+
+int check_appname(char *task_name)
+{
+ char *start = appname, *end;
+
+ if (!strlen(appname))
+ return 0;
+
+ // support appname: app1#app2#appN
+ while (*start != '\0') {
+ end = strchr(start, '#');
+ if (end == start) {
+ start++;
+ continue;
+ }
+
+ if (!end) {
+ if (!strncmp(task_name, start, strlen(start)))
+ return 0;
+ break;
+ }
+
+ if (!strncmp(task_name, start, end - start))
+ return 0;
+ start = end + 1;
+ }
+ return -EOPNOTSUPP;
+}
+
+static u32 __ethtool_get_flags(struct net_device *dev)
+{
+ u32 flags = 0;
+
+ if (dev->features & NETIF_F_LRO)
+ flags |= ETH_FLAG_LRO;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX)
+ flags |= ETH_FLAG_RXVLAN;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_TX)
+ flags |= ETH_FLAG_TXVLAN;
+ if (dev->features & NETIF_F_NTUPLE)
+ flags |= ETH_FLAG_NTUPLE;
+ if (dev->features & NETIF_F_RXHASH)
+ flags |= ETH_FLAG_RXHASH;
+
+ return flags;
+}
+
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
+{
+ netdev_features_t features = 0, changed;
+
+ if (data & ~ETH_ALL_FLAGS)
+ return -EINVAL;
+
+ if (data & ETH_FLAG_LRO)
+ features |= NETIF_F_LRO;
+ if (data & ETH_FLAG_RXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_RX;
+ if (data & ETH_FLAG_TXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_TX;
+ if (data & ETH_FLAG_NTUPLE)
+ features |= NETIF_F_NTUPLE;
+ if (data & ETH_FLAG_RXHASH)
+ features |= NETIF_F_RXHASH;
+
+ /* allow changing only bits set in hw_features */
+ changed = (features ^ dev->features) & ETH_ALL_FEATURES;
+ if (changed & ~dev->hw_features)
+ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
+
+ dev->wanted_features =
+ (dev->wanted_features & ~changed) | (features & changed);
+
+ __netdev_update_features(dev);
+
+ return 0;
+}
+
+static void ethtool_rxnfc_copy_to_user(void *useraddr,
+ const struct ethtool_rxnfc *rxnfc,
+ size_t size, const u32 *rule_buf)
+{
+ memcpy_r(useraddr, rxnfc, size);
+ useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+
+ if (rule_buf)
+ memcpy_r(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32));
+}
+
+static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
+ u32 cmd, void *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ int rc;
+
+ if (!dev->ethtool_ops->set_rxnfc)
+ return -EOPNOTSUPP;
+
+ if (cmd == ETHTOOL_SRXFH)
+ info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info.data));
+
+ memcpy_r(&info, useraddr, info_size);
+ rc = dev->ethtool_ops->set_rxnfc(dev, &info);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS)
+ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL);
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
+ u32 cmd, void *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int ret;
+ void *rule_buf = NULL;
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ if (cmd == ETHTOOL_GRXFH)
+ info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info.data));
+
+ memcpy_r(&info, useraddr, info_size);
+
+ /* If FLOW_RSS was requested then user-space must be using the
+ * new definition, as FLOW_RSS is newer.
+ */
+ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) {
+ info_size = sizeof(info);
+ memcpy_r(&info, useraddr, info_size);
+ /* Since malicious users may modify the original data,
+ * we need to check whether FLOW_RSS is still requested.
+ */
+ if (!(info.flow_type & FLOW_RSS))
+ return -EINVAL;
+ }
+
+ if (info.cmd != cmd)
+ return -EINVAL;
+
+ if (info.cmd == ETHTOOL_GRXCLSRLALL) {
+ if (info.rule_cnt > 0) {
+ if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
+ rule_buf = kcalloc(info.rule_cnt, sizeof(u32),
+ GFP_KERNEL);
+ if (!rule_buf)
+ return -ENOMEM;
+ }
+ }
+
+ ret = ops->get_rxnfc(dev, &info, rule_buf);
+ if (ret < 0)
+ goto err_out;
+
+ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf);
+err_out:
+ kfree(rule_buf);
+
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
+ void *useraddr)
+{
+ struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
+
+ if (!dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_channels(dev, &channels);
+
+ memcpy_r(useraddr, &channels, sizeof(channels));
+ return 0;
+}
+
+static int ethtool_get_value(struct net_device *dev, char *useraddr,
+ u32 cmd, u32 (*actor)(struct net_device *))
+{
+ struct ethtool_value edata = { .cmd = cmd };
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ edata.data = actor(dev);
+
+ memcpy_r(useraddr, &edata, sizeof(edata));
+ return 0;
+}
+
+static int ethtool_set_value(struct net_device *dev, char *useraddr,
+ int (*actor)(struct net_device *, u32))
+{
+ struct ethtool_value edata;
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ memcpy_r(&edata, useraddr, sizeof(edata));
+
+ return actor(dev, edata.data);
+}
+
+static int dev_ethtool_kern(struct net *net, struct ifreq *ifr)
+{
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+ void *useraddr = ifr->ifr_data;
+ u32 ethcmd, sub_cmd;
+ int rc;
+ netdev_features_t old_features;
+
+ if (!dev || !netif_device_present(dev))
+ return -ENODEV;
+
+ memcpy_r(&ethcmd, useraddr, sizeof(ethcmd));
+
+ if (ethcmd == ETHTOOL_PERQUEUE)
+ memcpy_r(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd));
+ else
+ sub_cmd = ethcmd;
+
+ /* Allow some commands to be done by anyone */
+ switch (sub_cmd) {
+ case ETHTOOL_GFLAGS:
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ case ETHTOOL_GCHANNELS:
+ break;
+ default:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ }
+
+ if (dev->ethtool_ops->begin) {
+ rc = dev->ethtool_ops->begin(dev);
+ if (rc < 0)
+ return rc;
+ }
+ old_features = dev->features;
+
+ switch (ethcmd) {
+ case ETHTOOL_GFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ __ethtool_get_flags);
+ break;
+ case ETHTOOL_SFLAGS:
+ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
+ break;
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ rc = ethtool_get_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_SRXFH:
+ case ETHTOOL_SRXCLSRLDEL:
+ case ETHTOOL_SRXCLSRLINS:
+ rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_GCHANNELS:
+ rc = ethtool_get_channels(dev, useraddr);
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ }
+
+ if (dev->ethtool_ops->complete)
+ dev->ethtool_ops->complete(dev);
+
+ if (old_features != dev->features)
+ netdev_features_change(dev);
+
+ return rc;
+}
+
+int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd)
+{
+ struct ifreq ifr = {0};
+ int ret;
+
+ strncpy(ifr.ifr_name, ctx->netdev, sizeof(ctx->netdev));
+ ifr.ifr_data = cmd;
+
+ rtnl_lock();
+ ret = dev_ethtool_kern(&init_net, &ifr);
+ rtnl_unlock();
+
+ return ret;
+}
+
+struct oecls_netdev_info *get_oecls_netdev_info(unsigned int index)
+{
+ if (index >= OECLS_MAX_NETDEV_NUM)
+ return NULL;
+ return &oecls_netdev_info_table[index];
+}
+
+static struct oecls_netdev_info *alloc_oecls_netdev_info(void)
+{
+ if (oecls_netdev_num >= OECLS_MAX_NETDEV_NUM)
+ return NULL;
+
+ return &oecls_netdev_info_table[oecls_netdev_num++];
+}
+
+static bool check_irq_name(const char *irq_name, struct oecls_netdev_info *oecls_dev)
+{
+ if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx"))
+ return false;
+
+ if (strstr(irq_name, oecls_dev->dev_name))
+ return true;
+
+ if (oecls_dev->netdev->dev.parent &&
+ strstr(irq_name, dev_name(oecls_dev->netdev->dev.parent)))
+ return true;
+
+ return false;
+}
+
+static void get_netdev_queue_info(struct oecls_netdev_info *oecls_dev)
+{
+ struct oecls_netdev_queue_info *rxq_info;
+ struct irq_desc *desc;
+ int irq, cpu;
+
+ for_each_irq_desc(irq, desc) {
+ if (!desc->action)
+ continue;
+ if (!desc->action->name)
+ continue;
+ if (!check_irq_name(desc->action->name, oecls_dev))
+ continue;
+ if (oecls_dev->rxq_num >= OECLS_MAX_RXQ_NUM_PER_DEV)
+ break;
+ rxq_info = &oecls_dev->rxq[oecls_dev->rxq_num++];
+ rxq_info->irq = irq;
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(&desc->irq_data));
+ rxq_info->affinity_cpu = cpu;
+ oecls_debug("irq=%d, [%s], rxq_id=%d affinity_cpu:%d\n",
+ irq, desc->action->name, oecls_dev->rxq_num, cpu);
+ }
+}
+
+static int oecls_filter_enable(const char *dev_name, bool *old_state)
+{
+ struct ethtool_value eval = {0};
+ struct cmd_context ctx = {0};
+ int ret;
+
+ strncpy(ctx.netdev, dev_name, IFNAMSIZ);
+
+ eval.cmd = ETHTOOL_GFLAGS;
+ ret = send_ethtool_ioctl(&ctx, &eval);
+ if (ret != 0) {
+ oecls_error("get %s flags fail, ret:%d\n", dev_name, ret);
+ return ret;
+ }
+ if (eval.data & ETH_FLAG_NTUPLE) {
+ *old_state = true;
+ oecls_debug("%s ntuple is already on\n", dev_name);
+ return 0;
+ }
+
+ // Set ntuple feature
+ eval.cmd = ETHTOOL_SFLAGS;
+ eval.data |= ETH_FLAG_NTUPLE;
+ ret = send_ethtool_ioctl(&ctx, &eval);
+ if (ret != 0) {
+ oecls_error("set %s flags fail, ret:%d\n", dev_name, ret);
+ return ret;
+ }
+
+ // Get ntuple feature
+ eval.cmd = ETHTOOL_GFLAGS;
+ eval.data = 0;
+ ret = send_ethtool_ioctl(&ctx, &eval);
+ if (ret != 0) {
+ oecls_error("get %s flags fail, ret:%d\n", dev_name, ret);
+ return ret;
+ }
+ if (!(eval.data & ETH_FLAG_NTUPLE)) {
+ oecls_error("enable ntuple feature fail!\n");
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void oecls_filter_restore(const char *dev_name, bool old_state)
+{
+ struct ethtool_value eval = {0};
+ struct cmd_context ctx = {0};
+ bool cur_filter_state;
+ int ret;
+
+ strncpy(ctx.netdev, dev_name, IFNAMSIZ);
+
+ eval.cmd = ETHTOOL_GFLAGS;
+ ret = send_ethtool_ioctl(&ctx, &eval);
+ if (ret != 0) {
+ oecls_error("get %s flags fail, ret:%d\n", dev_name, ret);
+ return;
+ }
+
+ cur_filter_state = (eval.data & ETH_FLAG_NTUPLE) ? true : false;
+ if (cur_filter_state == old_state)
+ return;
+
+ // Set ntuple feature
+ eval.cmd = ETHTOOL_SFLAGS;
+ if (old_state)
+ eval.data |= ETH_FLAG_NTUPLE;
+ else
+ eval.data &= ~ETH_FLAG_NTUPLE;
+ ret = send_ethtool_ioctl(&ctx, &eval);
+ if (ret != 0) {
+ oecls_error("set %s flags fail, ret:%d\n", dev_name, ret);
+ return;
+ }
+}
+
+static int init_single_oecls_dev(char *if_name, unsigned int length)
+{
+ struct oecls_netdev_info *oecls_dev;
+ char dev_name[IFNAMSIZ] = { 0 };
+ struct net_device *netdev;
+ int cpy_len = length < IFNAMSIZ ? length : IFNAMSIZ - 1;
+ bool old_state = false;
+ int ret;
+
+ strncpy(dev_name, if_name, cpy_len);
+ netdev = dev_get_by_name(&init_net, dev_name);
+ if (!netdev) {
+ oecls_error("dev [%s] is not exist!\n", dev_name);
+ return -ENODEV;
+ }
+
+ if (!(netdev->flags & IFF_UP)) {
+ ret = -ENETDOWN;
+ oecls_error("dev:%s not up! flags=%d.\n", dev_name, netdev->flags);
+ goto out;
+ }
+
+ if (netdev->flags & IFF_LOOPBACK) {
+ ret = -EOPNOTSUPP;
+ oecls_error("Do not support loopback.\n");
+ goto out;
+ }
+
+ ret = oecls_filter_enable(dev_name, &old_state);
+ if (ret) {
+ oecls_error("dev [%s] not support ntuple! ret=%d\n", dev_name, ret);
+ goto out;
+ }
+
+ oecls_dev = alloc_oecls_netdev_info();
+ if (!oecls_dev) {
+ ret = -ENOMEM;
+ oecls_filter_restore(dev_name, old_state);
+ oecls_error("alloc oecls_dev fail! oecls_netdev_num:%d\n", oecls_netdev_num);
+ goto out;
+ }
+
+ memcpy_r(oecls_dev->dev_name, dev_name, IFNAMSIZ);
+ oecls_dev->old_filter_state = old_state;
+ oecls_dev->netdev = netdev;
+ get_netdev_queue_info(oecls_dev);
+ return 0;
+
+out:
+ dev_put(netdev);
+ return ret;
+}
+
+static void clean_oecls_netdev_info(void)
+{
+ struct oecls_netdev_info *oecls_dev;
+ struct net_device *netdev;
+ int devid;
+
+ for_each_oecls_netdev(devid, oecls_dev) {
+ oecls_filter_restore(oecls_dev->dev_name, oecls_dev->old_filter_state);
+ netdev = oecls_dev->netdev;
+ if (netdev) {
+ oecls_dev->netdev = NULL;
+ dev_put(netdev);
+ }
+ }
+
+ oecls_netdev_num = 0;
+}
+
+static int init_oecls_netdev_info(char *netdev_str)
+{
+ char *start = netdev_str, *end;
+ int err = -ENODEV;
+
+ while (*start != '\0') {
+ // skip start #
+ end = strchr(start, '#');
+ if (end == start) {
+ start++;
+ continue;
+ }
+
+ // find the last ifname
+ if (!end) {
+ err = init_single_oecls_dev(start, strlen(start));
+ break;
+ }
+
+ err = init_single_oecls_dev(start, end - start);
+ if (err)
+ break;
+ start = end + 1;
+ }
+
+ return err;
+}
+
+struct oecls_numa_info *get_oecls_numa_info(unsigned int nid)
+{
+ if (nid >= oecls_numa_num)
+ return NULL;
+ return &oecls_numa_info_table[nid];
+}
+
+static void clean_oecls_numa_info(void)
+{
+ oecls_numa_num = 0;
+ kfree(oecls_numa_info_table);
+}
+
+static void init_numa_avail_cpus(int nid, struct oecls_numa_info *numa_info)
+{
+ int cpu;
+
+ oecls_debug("numa node %d: %*pb, %*pbl\n", nid, cpumask_pr_args(cpumask_of_node(nid)),
+ cpumask_pr_args(cpumask_of_node(nid)));
+
+ bitmap_zero(numa_info->avail_cpus, OECLS_MAX_CPU_NUM);
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ if (cpu >= OECLS_MAX_CPU_NUM)
+ return;
+ set_bit(cpu, numa_info->avail_cpus);
+ }
+}
+
+static void clean_oecls_rxq(void)
+{
+ struct oecls_numa_bound_dev_info *bound_dev;
+ struct oecls_netdev_info *oecls_dev;
+ struct oecls_numa_info *numa_info;
+ int nid, devid;
+
+ for_each_oecls_numa(nid, numa_info) {
+ for_each_oecls_netdev(devid, oecls_dev) {
+ bound_dev = &numa_info->bound_dev[devid];
+ kfree(bound_dev->cluster_info);
+ }
+ }
+}
+
+static int init_numa_rxq_bitmap(int nid, struct oecls_numa_info *numa_info)
+{
+ int bound_rxq_num, cluster_id, cluster_idx, cur_idx;
+ struct oecls_numa_bound_dev_info *bound_dev;
+ struct oecls_netdev_info *oecls_dev;
+ int rxq_id, devid, cpu, ret = 0;
+
+ for_each_oecls_netdev(devid, oecls_dev) {
+ bound_rxq_num = 0;
+ bound_dev = &numa_info->bound_dev[devid];
+ bitmap_zero(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV);
+ bound_dev->cluster_info = kcalloc(oecls_cluster_per_numa,
+ sizeof(*bound_dev->cluster_info), GFP_ATOMIC);
+ if (!bound_dev->cluster_info) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) {
+ cpu = oecls_dev->rxq[rxq_id].affinity_cpu;
+ if (cpu_to_node(cpu) == nid) {
+ set_bit(rxq_id, bound_dev->bitmap_rxq);
+ cluster_id = cpu / oecls_cluster_cpu_num;
+ cluster_idx = cluster_id % oecls_cluster_per_numa;
+ bound_dev->cluster_info[cluster_idx].cluster_id = cluster_id;
+ cur_idx = bound_dev->cluster_info[cluster_idx].cur_freeidx++;
+ bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].rxq_id = rxq_id;
+ bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].status = 1;
+ bound_rxq_num++;
+ oecls_debug("cpu:%d cluster_id:%d cluster_idx:%d rxq_id:%d cur_idx:%d\n",
+ cpu, cluster_id, cluster_idx, rxq_id, cur_idx);
+ }
+ }
+
+ oecls_debug("nid:%d, dev_id:%d, dev:%s, rxq_num:%d, bit_num:%d, bitmap_rxq:%*pbl\n",
+ nid, devid, oecls_dev->dev_name, oecls_dev->rxq_num,
+ bound_rxq_num, OECLS_MAX_RXQ_NUM_PER_DEV, bound_dev->bitmap_rxq);
+ }
+ return ret;
+
+out:
+ clean_oecls_rxq();
+ return ret;
+}
+
+static int get_cluster_rxq(struct oecls_numa_bound_dev_info *bound_dev)
+{
+ int cpu = smp_processor_id();
+ int cluster_id = cpu / oecls_cluster_cpu_num;
+ int i, j, rxq_id;
+
+ for (i = 0; i < oecls_cluster_per_numa; i++) {
+ if (cluster_id != bound_dev->cluster_info[i].cluster_id)
+ continue;
+ for (j = 0; j < OECLS_MAX_RXQ_NUM_PER_DEV; j++) {
+ if (bound_dev->cluster_info[i].rxqs[j].status == 1) {
+ bound_dev->cluster_info[i].rxqs[j].status = 2;
+ rxq_id = bound_dev->cluster_info[i].rxqs[j].rxq_id;
+ oecls_debug("cluster:%d cpu:%d alloc rxq_id:%d\n",
+ cluster_id, cpu, rxq_id);
+ return rxq_id;
+ }
+ }
+ }
+ oecls_debug("cluster:%d no free rxq for cpu:%d\n", cluster_id, cpu);
+ return -1;
+}
+
+static int put_cluster_rxq(struct oecls_numa_bound_dev_info *bound_dev, int rxq_id)
+{
+ int i, j;
+
+ for (i = 0; i < oecls_cluster_per_numa; i++) {
+ for (j = 0; j < OECLS_MAX_RXQ_NUM_PER_DEV; j++) {
+ if (bound_dev->cluster_info[i].rxqs[j].status == 2 &&
+ bound_dev->cluster_info[i].rxqs[j].rxq_id == rxq_id) {
+ bound_dev->cluster_info[i].rxqs[j].status = 1;
+ oecls_debug("free rxq_id:%d\n", rxq_id);
+ return 0;
+ }
+ }
+ }
+ oecls_debug("no match malloced rxq_id:%d\n", rxq_id);
+ return -1;
+}
+
+int alloc_rxq_id(int nid, int devid)
+{
+ struct oecls_numa_bound_dev_info *bound_dev;
+ struct oecls_numa_info *numa_info;
+ int rxq_id;
+
+ numa_info = get_oecls_numa_info(nid);
+ if (!numa_info) {
+ oecls_error("error nid:%d\n", nid);
+ return -EINVAL;
+ }
+
+ if (devid >= OECLS_MAX_NETDEV_NUM) {
+ oecls_error("error bound_dev index:%d\n", devid);
+ return -EINVAL;
+ }
+ bound_dev = &numa_info->bound_dev[devid];
+
+ if (strategy == 1) {
+ rxq_id = get_cluster_rxq(bound_dev);
+ if (rxq_id < 0 || rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV)
+ pr_info("failed to get rxq_id:%d in cluster, try numa\n", rxq_id);
+ else
+ goto found;
+ }
+
+ rxq_id = find_first_bit(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV);
+ if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) {
+ oecls_error("error rxq_id:%d\n", rxq_id);
+ return -EINVAL;
+ }
+
+found:
+ clear_bit(rxq_id, bound_dev->bitmap_rxq);
+ oecls_debug("alloc nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id);
+ return rxq_id;
+}
+
+void free_rxq_id(int nid, int devid, int rxq_id)
+{
+ struct oecls_numa_bound_dev_info *bound_dev;
+ struct oecls_numa_info *numa_info;
+
+ numa_info = get_oecls_numa_info(nid);
+ if (!numa_info) {
+ oecls_error("error nid:%d\n", nid);
+ return;
+ }
+
+ if (devid >= OECLS_MAX_NETDEV_NUM) {
+ oecls_error("error bound_dev index:%d\n", devid);
+ return;
+ }
+ bound_dev = &numa_info->bound_dev[devid];
+
+ if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) {
+ oecls_error("error rxq_id:%d\n", rxq_id);
+ return;
+ }
+
+ if (strategy == 1)
+ put_cluster_rxq(bound_dev, rxq_id);
+
+ if (test_bit(rxq_id, bound_dev->bitmap_rxq)) {
+ oecls_error("error nid:%d, devid:%d, rxq_id:%d\n", nid, devid, rxq_id);
+ return;
+ }
+
+ set_bit(rxq_id, bound_dev->bitmap_rxq);
+ oecls_debug("free nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id);
+}
+
+static int init_oecls_numa_info(void)
+{
+ struct oecls_numa_info *numa_info;
+ int nid, ret = 0;
+
+ oecls_numa_num = num_online_nodes();
+ oecls_numa_info_table = kcalloc(oecls_numa_num, sizeof(*oecls_numa_info_table),
+ GFP_ATOMIC);
+ if (!oecls_numa_info_table) {
+ ret = -ENOMEM;
+ oecls_error("oecls_numa_info_table alloc failed:%d\n", ret);
+ return ret;
+ }
+
+ oecls_cluster_cpu_num = cpumask_weight(topology_cluster_cpumask(smp_processor_id()));
+ oecls_cluster_per_numa = (nr_cpu_ids / oecls_cluster_cpu_num) / oecls_numa_num;
+ oecls_debug("oecls_numa_num=%d cluster_cpu_num:%d cluster_cpu_num:%d\n",
+ oecls_numa_num, oecls_cluster_per_numa, oecls_cluster_cpu_num);
+
+ for_each_oecls_numa(nid, numa_info)
+ init_numa_avail_cpus(nid, numa_info);
+
+ return ret;
+}
+
+static int alloc_available_cpu(int nid, struct oecls_numa_info *numa_info)
+{
+ int cpu;
+
+ cpu = find_first_bit(numa_info->avail_cpus, OECLS_MAX_CPU_NUM);
+ if (cpu >= OECLS_MAX_CPU_NUM) {
+ oecls_error("no available cpus: nid=%d, cpu=%d\n", nid, cpu);
+ return -1;
+ }
+
+ clear_bit(cpu, numa_info->avail_cpus);
+ return cpu;
+}
+
+static void add_netdev_irq_affinity_cpu(struct oecls_netdev_info *oecls_dev, int rxq_id, int cpu)
+{
+ struct oecls_netdev_queue_info *rxq_info;
+
+ if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV)
+ return;
+
+ rxq_info = &oecls_dev->rxq[rxq_id];
+ rxq_info->affinity_cpu = cpu;
+}
+
+static void config_affinity_strategy_default(struct oecls_netdev_info *oecls_dev)
+{
+ struct oecls_numa_info *numa_info;
+ int rxq_num = oecls_dev->rxq_num;
+ int rxq_per_numa = rxq_num / oecls_numa_num;
+ int remain = rxq_num - rxq_per_numa * oecls_numa_num;
+ int numa_rxq_id, rxq_id, nid, cpu;
+
+ oecls_debug("dev=%s, rxq_num=%d, rxq_per_numa=%d, remain=%d\n", oecls_dev->dev_name,
+ rxq_num, rxq_per_numa, remain);
+
+ // Distribute rxqs evenly across the NUMA nodes
+ for_each_oecls_numa(nid, numa_info) {
+ for (numa_rxq_id = 0; numa_rxq_id < rxq_per_numa; numa_rxq_id++) {
+ cpu = alloc_available_cpu(nid, numa_info);
+ if (cpu < 0)
+ break;
+
+ rxq_id = rxq_per_numa * nid + numa_rxq_id;
+ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id, cpu);
+ }
+ }
+
+ if (!remain)
+ return;
+
+ // Distribute the remaining rxqs across the NUMA nodes
+ numa_rxq_id = 0;
+ for_each_oecls_numa(nid, numa_info) {
+ if (numa_rxq_id >= remain)
+ break;
+ cpu = alloc_available_cpu(nid, numa_info);
+ if (cpu < 0)
+ break;
+
+ rxq_id = rxq_per_numa * oecls_numa_num + numa_rxq_id;
+ numa_rxq_id++;
+ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id, cpu);
+ }
+}
+
+static void config_affinity_strategy_cluster(struct oecls_netdev_info *oecls_dev)
+{
+ int rxq_num = oecls_dev->rxq_num;
+ int rxq_per_numa = rxq_num / oecls_numa_num;
+ int remain = rxq_num - rxq_per_numa * oecls_numa_num;
+ int cpu_idx = oecls_cluster_cpu_num - 1;
+ int cluster, cpu, rxq_id = 0, round;
+
+ round = rxq_per_numa < oecls_cluster_per_numa ? rxq_per_numa : oecls_cluster_per_numa;
+ if (remain > 0)
+ round++;
+ oecls_debug("round=%d\n", round);
+
+ while (rxq_id < oecls_dev->rxq_num) {
+ for (cluster = 0; cluster < oecls_cluster_per_numa * oecls_numa_num; cluster++) {
+ if (cluster % oecls_cluster_per_numa >= round)
+ continue;
+ cpu = cluster * oecls_cluster_cpu_num + cpu_idx;
+ if (rxq_id >= oecls_dev->rxq_num)
+ break;
+ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu);
+ }
+ cpu_idx--;
+ if (--cpu_idx < 0)
+ cpu_idx = oecls_cluster_cpu_num - 1;
+ }
+}
+
+static void config_affinity_strategy_numa(struct oecls_netdev_info *oecls_dev)
+{
+ int rxq_num = oecls_dev->rxq_num;
+ int rxq_per_numa = rxq_num / oecls_numa_num;
+ int cpu_per_numa = nr_cpu_ids / oecls_numa_num;
+ int remain = rxq_num - rxq_per_numa * oecls_numa_num;
+ struct oecls_numa_info *numa_info;
+ int numa_start_cpu, numa_cpu_id;
+ int rxq_id = 0, nid, cpu;
+
+ for_each_oecls_numa(nid, numa_info) {
+ numa_start_cpu = find_first_bit(numa_info->avail_cpus, OECLS_MAX_CPU_NUM);
+ for (numa_cpu_id = 0; numa_cpu_id < rxq_per_numa; numa_cpu_id++) {
+ cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa);
+ if (rxq_id >= oecls_dev->rxq_num)
+ break;
+ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu);
+ }
+ if (remain-- > 0) {
+ cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa);
+ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu);
+ }
+ }
+}
+
+static void config_affinity_strategy_custom(struct oecls_netdev_info *oecls_dev)
+{
+ oecls_debug("dev=%s\n", oecls_dev->dev_name);
+}
+
+static void config_affinity_strategy(void)
+{
+ struct oecls_netdev_info *oecls_dev;
+ int devid;
+
+ for_each_oecls_netdev(devid, oecls_dev) {
+ switch (strategy) {
+ case 1:
+ config_affinity_strategy_cluster(oecls_dev);
+ break;
+ case 2:
+ config_affinity_strategy_numa(oecls_dev);
+ break;
+ case 3:
+ config_affinity_strategy_custom(oecls_dev);
+ break;
+ case 0:
+ default:
+ config_affinity_strategy_default(oecls_dev);
+ break;
+ }
+ }
+}
+
+static inline void irq_set_affinity_wrapper(int rxq, int irq, int cpu)
+{
+ int err = 0;
+
+ err = irq_set_affinity(irq, get_cpu_mask(cpu));
+ oecls_debug("rxq=%d, irq=%d, cpu=%d, err=%d\n", rxq, irq, cpu, err);
+}
+
+static void enable_affinity_strategy(void)
+{
+ struct oecls_netdev_queue_info *rxq_info;
+ struct oecls_netdev_info *oecls_dev;
+ int rxq_id, devid;
+
+ for_each_oecls_netdev(devid, oecls_dev) {
+ for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) {
+ rxq_info = &oecls_dev->rxq[rxq_id];
+ irq_set_affinity_wrapper(rxq_id, rxq_info->irq, rxq_info->affinity_cpu);
+ }
+ }
+}
+
+static inline void netif_set_xps_queue_wrapper(struct net_device *netdev, int rxq_id,
+ const struct cpumask *cpu_mask)
+{
+ int err = 0;
+
+ err = netif_set_xps_queue(netdev, cpu_mask, rxq_id);
+ oecls_debug("name=%s, rxq_id=%d, mask=%*pbl, err=%d\n", netdev->name, rxq_id,
+ cpumask_pr_args(cpu_mask), err);
+}
+
+static void set_netdev_xps_queue(bool enable)
+{
+ const struct cpumask clear_mask = { 0 };
+ struct oecls_netdev_info *oecls_dev;
+ const struct cpumask *cpu_mask;
+ int rxq_id, devid, cpu, nid;
+
+ for_each_oecls_netdev(devid, oecls_dev) {
+ for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) {
+ cpu = oecls_dev->rxq[rxq_id].affinity_cpu;
+ nid = cpu_to_node(cpu);
+ if (enable)
+ cpu_mask = cpumask_of_node(nid);
+ else
+ cpu_mask = &clear_mask;
+
+ netif_set_xps_queue_wrapper(oecls_dev->netdev, rxq_id, cpu_mask);
+ }
+ }
+}
+
+static __init int oecls_init(void)
+{
+ struct oecls_numa_info *numa_info;
+ int nid, err;
+
+ if (!check_params())
+ return -EINVAL;
+
+ err = init_oecls_numa_info();
+ if (err)
+ return err;
+
+ err = init_oecls_netdev_info(ifname);
+ if (err)
+ goto clean_numa;
+
+ // Set irq affinity
+ config_affinity_strategy();
+ enable_affinity_strategy();
+
+ // Record which rxqs are bound to each NUMA node
+ for_each_oecls_numa(nid, numa_info) {
+ err = init_numa_rxq_bitmap(nid, numa_info);
+ if (err)
+ goto clean_rxq;
+ }
+
+#ifdef CONFIG_XPS
+ set_netdev_xps_queue(true);
+#endif
+
+ if (mode == 0)
+ oecls_ntuple_res_init();
+ else
+ oecls_flow_res_init();
+
+ return 0;
+
+clean_rxq:
+clean_numa:
+ clean_oecls_netdev_info();
+ clean_oecls_numa_info();
+ return err;
+}
+
+static __exit void oecls_exit(void)
+{
+ if (mode == 0)
+ oecls_ntuple_res_clean();
+ else
+ oecls_flow_res_clean();
+
+#ifdef CONFIG_XPS
+ set_netdev_xps_queue(false);
+#endif
+
+ clean_oecls_rxq();
+ clean_oecls_netdev_info();
+ clean_oecls_numa_info();
+}
+
+module_init(oecls_init);
+module_exit(oecls_exit);
+
+MODULE_DESCRIPTION("oenetcls");
+MODULE_LICENSE("GPL v2");
diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c
new file mode 100644
index 000000000000..3986d86efe83
--- /dev/null
+++ b/net/oenetcls/oenetcls_ntuple.c
@@ -0,0 +1,565 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/inetdevice.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/inet.h>
+#include <linux/jhash.h>
+#include <net/sock.h>
+#include <trace/hooks/oenetcls.h>
+#include "oenetcls.h"
+
+struct oecls_sk_rule_list oecls_sk_rules, oecls_sk_list;
+
+static void init_oecls_sk_rules(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < OECLS_SK_RULE_HASHSIZE; i++)
+ INIT_HLIST_HEAD(oecls_sk_rules.hash + i);
+ mutex_init(&oecls_sk_rules.mutex);
+}
+
+static inline struct hlist_head *get_rule_hashlist(u32 dip4, u16 dport)
+{
+ return oecls_sk_rules.hash + (jhash_2words(dip4, dport, 0) & OECLS_SK_RULE_HASHMASK);
+}
+
+static inline struct hlist_head *get_sk_hashlist(void *sk)
+{
+ return oecls_sk_list.hash + (jhash(sk, sizeof(sk), 0) & OECLS_SK_RULE_HASHMASK);
+}
+
+static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action,
+ int ruleid, int nid)
+{
+ struct hlist_head *hlist = get_rule_hashlist(dip4, dport);
+ struct hlist_head *sk_hlist = get_sk_hashlist(sk);
+ struct oecls_sk_rule *rule;
+ struct oecls_sk_entry *entry;
+
+ rule = kzalloc(sizeof(*rule), GFP_ATOMIC);
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!rule || !entry)
+ goto out;
+
+ rule->sk = sk;
+ rule->dip4 = dip4;
+ rule->dport = dport;
+ rule->devid = devid;
+ rule->action = action;
+ rule->ruleid = ruleid;
+ rule->nid = nid;
+ hlist_add_head(&rule->node, hlist);
+
+ entry->sk = sk;
+ entry->sk_rule_hash = jhash_2words(dip4, dport, 0);
+ hlist_add_head(&entry->node, sk_hlist);
+ return;
+out:
+ oecls_debug("alloc failed rule:%p entry:%p\n", rule, entry);
+ kfree(entry);
+ kfree(rule);
+}
+
+static struct oecls_sk_entry *get_sk_entry(void *sk)
+{
+ struct hlist_head *sk_hlist = get_sk_hashlist(sk);
+ struct oecls_sk_entry *entry = NULL;
+
+ hlist_for_each_entry(entry, sk_hlist, node) {
+ if (entry->sk == sk)
+ break;
+ }
+ return entry;
+}
+
+static void del_sk_rule(struct oecls_sk_rule *rule)
+{
+ struct oecls_sk_entry *entry;
+
+ entry = get_sk_entry(rule->sk);
+ if (!entry)
+ return;
+ hlist_del_init(&entry->node);
+ kfree(entry);
+
+ oecls_debug("del rule=%p\n", rule);
+ hlist_del_init(&rule->node);
+ kfree(rule);
+}
+
+static struct oecls_sk_rule *get_sk_rule(int devid, u32 dip4, u16 dport)
+{
+ struct hlist_head *hlist = get_rule_hashlist(dip4, dport);
+ struct oecls_sk_rule *rule = NULL;
+
+ hlist_for_each_entry(rule, hlist, node) {
+ if (rule->devid == devid && rule->dip4 == dip4 && rule->dport == dport)
+ break;
+ }
+ return rule;
+}
+
+static struct oecls_sk_rule *get_rule_from_sk(int devid, void *sk)
+{
+ struct oecls_sk_rule *rule = NULL;
+ struct oecls_sk_entry *entry;
+ struct hlist_head *hlist;
+
+ entry = get_sk_entry(sk);
+ if (!entry)
+ return NULL;
+
+ hlist = oecls_sk_rules.hash + (entry->sk_rule_hash & OECLS_SK_RULE_HASHMASK);
+ hlist_for_each_entry(rule, hlist, node) {
+ if (rule->devid == devid && rule->sk == sk)
+ break;
+ }
+ return rule;
+}
+
+static inline bool reuseport_check(int devid, u32 dip4, u16 dport)
+{
+ return !!get_sk_rule(devid, dip4, dport);
+}
+
+static u32 get_first_ip4_addr(struct net *net)
+{
+ struct in_device *in_dev;
+ struct net_device *dev;
+ struct in_ifaddr *ifa;
+ u32 dip4 = 0;
+
+ rtnl_lock();
+ rcu_read_lock();
+ for_each_netdev(net, dev) {
+ if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP))
+ continue;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (!strcmp(dev->name, ifa->ifa_label)) {
+ dip4 = ifa->ifa_local;
+ oecls_debug("dev: %s, dip4:%pI4\n", dev->name, &dip4);
+ goto out;
+ }
+ }
+ }
+out:
+ rcu_read_unlock();
+ rtnl_unlock();
+ return dip4;
+}
+
+static void get_sk_rule_addr(struct sock *sk, u32 *dip4, u16 *dport)
+{
+ *dport = htons(sk->sk_num);
+
+ if (!match_ip_flag) {
+ *dip4 = 0;
+ return;
+ }
+
+ if (sk->sk_rcv_saddr)
+ *dip4 = sk->sk_rcv_saddr;
+ else
+ *dip4 = get_first_ip4_addr(sock_net(sk));
+}
+
+static int rxclass_rule_del(struct cmd_context *ctx, __u32 loc)
+{
+ struct ethtool_rxnfc nfccmd;
+ int err;
+
+ nfccmd.cmd = ETHTOOL_SRXCLSRLDEL;
+ nfccmd.fs.location = loc;
+ err = send_ethtool_ioctl(ctx, &nfccmd);
+ if (err < 0)
+ oecls_debug("rmgr: Cannot delete RX class rule, loc:%u\n", loc);
+ return err;
+}
+
+static int rmgr_ins(struct rmgr_ctrl *rmgr, __u32 loc)
+{
+ if (loc >= rmgr->size) {
+ oecls_error("rmgr: Location out of range\n");
+ return -1;
+ }
+
+ set_bit(loc, rmgr->slot);
+ return 0;
+}
+
+static int rmgr_find_empty_slot(struct rmgr_ctrl *rmgr, struct ethtool_rx_flow_spec *fsp)
+{
+ __u32 loc, slot_num;
+
+ if (rmgr->driver_select)
+ return 0;
+
+ loc = rmgr->size - 1;
+ slot_num = loc / BITS_PER_LONG;
+ if (!~(rmgr->slot[slot_num] | (~1UL << rmgr->size % BITS_PER_LONG))) {
+ loc -= 1 + (loc % BITS_PER_LONG);
+ slot_num--;
+ }
+
+ while (loc < rmgr->size && !~(rmgr->slot[slot_num])) {
+ loc -= BITS_PER_LONG;
+ slot_num--;
+ }
+
+ while (loc < rmgr->size && test_bit(loc, rmgr->slot))
+ loc--;
+
+ if (loc < rmgr->size) {
+ fsp->location = loc;
+ return rmgr_ins(rmgr, loc);
+ }
+
+ return -1;
+}
+
+static int rxclass_get_dev_info(struct cmd_context *ctx, __u32 *count, int *driver_select)
+{
+ struct ethtool_rxnfc nfccmd;
+ int err;
+
+ nfccmd.cmd = ETHTOOL_GRXCLSRLCNT;
+ nfccmd.data = 0;
+ err = send_ethtool_ioctl(ctx, &nfccmd);
+ *count = nfccmd.rule_cnt;
+ if (driver_select)
+ *driver_select = !!(nfccmd.data & RX_CLS_LOC_SPECIAL);
+ if (err < 0)
+ oecls_debug("rxclass: Cannot get RX class rule count\n");
+
+ return err;
+}
+
+static int rmgr_init(struct cmd_context *ctx, struct rmgr_ctrl *rmgr)
+{
+ struct ethtool_rxnfc *nfccmd;
+ __u32 *rule_locs;
+ int i, err = 0;
+
+ memset(rmgr, 0, sizeof(*rmgr));
+ err = rxclass_get_dev_info(ctx, &rmgr->n_rules, &rmgr->driver_select);
+ if (err < 0)
+ return err;
+
+ if (rmgr->driver_select)
+ return err;
+
+ nfccmd = kzalloc(sizeof(*nfccmd) + (rmgr->n_rules * sizeof(__u32)), GFP_ATOMIC);
+ if (!nfccmd) {
+ oecls_error("rmgr: Cannot allocate memory for RX class rule locations\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ nfccmd->cmd = ETHTOOL_GRXCLSRLALL;
+ nfccmd->rule_cnt = rmgr->n_rules;
+ err = send_ethtool_ioctl(ctx, nfccmd);
+ if (err < 0) {
+ oecls_debug("rmgr: Cannot get RX class rules\n");
+ goto out;
+ }
+
+ rmgr->size = nfccmd->data;
+ if (rmgr->size == 0 || rmgr->size < rmgr->n_rules) {
+ oecls_error("rmgr: Invalid RX class rules table size\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ rmgr->slot = kzalloc(BITS_TO_LONGS(rmgr->size) * sizeof(long), GFP_ATOMIC);
+ if (!rmgr->slot) {
+ oecls_error("rmgr: Cannot allocate memory for RX class rules\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ rule_locs = nfccmd->rule_locs;
+ for (i = 0; i < rmgr->n_rules; i++) {
+ err = rmgr_ins(rmgr, rule_locs[i]);
+ if (err < 0)
+ break;
+ }
+
+out:
+ kfree(nfccmd);
+ return err;
+}
+
+static void rmgr_cleanup(struct rmgr_ctrl *rmgr)
+{
+ kfree(rmgr->slot);
+ rmgr->slot = NULL;
+ rmgr->size = 0;
+}
+
+static int rmgr_set_location(struct cmd_context *ctx,
+ struct ethtool_rx_flow_spec *fsp)
+{
+ struct rmgr_ctrl rmgr;
+ int ret;
+
+ ret = rmgr_init(ctx, &rmgr);
+ if (ret < 0)
+ goto out;
+
+ ret = rmgr_find_empty_slot(&rmgr, fsp);
+out:
+ rmgr_cleanup(&rmgr);
+ return ret;
+}
+
+static int rxclass_rule_ins(struct cmd_context *ctx,
+ struct ethtool_rx_flow_spec *fsp, u32 rss_context)
+{
+ struct ethtool_rxnfc nfccmd;
+ u32 loc = fsp->location;
+ int ret;
+
+ if (loc & RX_CLS_LOC_SPECIAL) {
+ ret = rmgr_set_location(ctx, fsp);
+ if (ret < 0)
+ return ret;
+ }
+
+ nfccmd.cmd = ETHTOOL_SRXCLSRLINS;
+ nfccmd.rss_context = rss_context;
+ nfccmd.fs = *fsp;
+ ret = send_ethtool_ioctl(ctx, &nfccmd);
+ if (ret < 0) {
+ oecls_debug("Can not insert the clasification rule\n");
+ return ret;
+ }
+
+ if (loc & RX_CLS_LOC_SPECIAL)
+ oecls_debug("Added rule with ID %d\n", nfccmd.fs.location);
+
+ return 0;
+}
+
+static void flow_spec_to_ntuple(struct ethtool_rx_flow_spec *fsp,
+ struct ethtool_rx_ntuple_flow_spec *ntuple)
+{
+ int i;
+
+ memset(ntuple, ~0, sizeof(*ntuple));
+ ntuple->flow_type = fsp->flow_type;
+ ntuple->action = fsp->ring_cookie;
+ memcpy_r(&ntuple->h_u, &fsp->h_u, sizeof(fsp->h_u));
+ memcpy_r(&ntuple->m_u, &fsp->m_u, sizeof(fsp->m_u));
+ for (i = 0; i < sizeof(ntuple->m_u); i++)
+ ntuple->m_u.hdata[i] ^= 0xFF;
+ ntuple->flow_type &= ~FLOW_EXT;
+}
+
+static int do_srxntuple(struct cmd_context *ctx, struct ethtool_rx_flow_spec *fsp)
+{
+ struct ethtool_rx_ntuple ntuplecmd;
+ struct ethtool_value eval;
+ int ret = 0;
+
+ flow_spec_to_ntuple(fsp, &ntuplecmd.fs);
+
+ eval.cmd = ETHTOOL_GFLAGS;
+ ret = send_ethtool_ioctl(ctx, &eval);
+ if (ret || !(eval.data & ETH_FLAG_NTUPLE))
+ return -1;
+
+ ntuplecmd.cmd = ETHTOOL_SRXNTUPLE;
+ ret = send_ethtool_ioctl(ctx, &ntuplecmd);
+ if (ret)
+ oecls_debug("Cannot add new rule via N-tuple, ret:%d\n", ret);
+
+ return ret;
+}
+
+static int cfg_ethtool_rule(struct cmd_context *ctx, bool is_del)
+{
+ struct ethtool_rx_flow_spec *fsp, rx_rule_fs;
+ u32 rss_context = 0;
+ int ret;
+
+ oecls_debug("is_del:%d netdev:%s, dip4:%pI4, dport:%d, action:%d, ruleid:%u, del_ruleid:%u\n",
+ is_del, ctx->netdev, &ctx->dip4, ntohs(ctx->dport), ctx->action, ctx->ruleid,
+ ctx->del_ruleid);
+
+ if (is_del)
+ return rxclass_rule_del(ctx, ctx->del_ruleid);
+
+ ctx->ret_loc = -1;
+
+ fsp = &rx_rule_fs;
+ memset(fsp, 0, sizeof(*fsp));
+ fsp->flow_type = TCP_V4_FLOW;
+ fsp->location = RX_CLS_LOC_ANY;
+ fsp->h_u.tcp_ip4_spec.ip4dst = ctx->dip4;
+ fsp->h_u.tcp_ip4_spec.pdst = ctx->dport;
+ if (ctx->dip4)
+ fsp->m_u.tcp_ip4_spec.ip4dst = (u32)~0ULL;
+ fsp->m_u.tcp_ip4_spec.pdst = (u16)~0ULL;
+ if (ctx->ruleid)
+ fsp->location = ctx->ruleid;
+ fsp->ring_cookie = ctx->action;
+
+ ret = do_srxntuple(ctx, &rx_rule_fs);
+ if (!ret)
+ return 0;
+
+ ret = rxclass_rule_ins(ctx, &rx_rule_fs, rss_context);
+ if (!ret)
+ ctx->ret_loc = rx_rule_fs.location;
+ return ret;
+}
+
+static void del_ntuple_rule(struct sock *sk)
+{
+ struct oecls_netdev_info *oecls_dev;
+ struct cmd_context ctx = { 0 };
+ struct oecls_sk_rule *rule;
+ int devid;
+ u16 dport;
+ u32 dip4;
+ int err;
+
+ get_sk_rule_addr(sk, &dip4, &dport);
+
+ mutex_lock(&oecls_sk_rules.mutex);
+ for_each_oecls_netdev(devid, oecls_dev) {
+ strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ);
+ rule = get_rule_from_sk(devid, sk);
+ if (!rule) {
+ oecls_debug("rule not found! sk:%p, devid:%d, dip4:%pI4, dport:%d\n",
+ sk, devid, &dip4, ntohs(dport));
+ continue;
+ }
+
+ // Remove the ntuple rule from the device
+ ctx.del_ruleid = rule->ruleid;
+ err = cfg_ethtool_rule(&ctx, true);
+ if (err) {
+ oecls_error("del sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n",
+ sk, rule->nid, devid, rule->action, rule->ruleid, err);
+ }
+
+ // Free the bound queue
+ free_rxq_id(rule->nid, devid, rule->action);
+
+ // Delete sk rule
+ del_sk_rule(rule);
+ }
+ mutex_unlock(&oecls_sk_rules.mutex);
+}
+
+static void add_ntuple_rule(struct sock *sk)
+{
+ struct oecls_netdev_info *oecls_dev;
+ struct cmd_context ctx = { 0 };
+ int cpu = smp_processor_id();
+ int nid = cpu_to_node(cpu);
+ int rxq_id;
+ int devid;
+ int err;
+
+ if (check_appname(current->comm))
+ return;
+ get_sk_rule_addr(sk, &ctx.dip4, &ctx.dport);
+
+ mutex_lock(&oecls_sk_rules.mutex);
+ for_each_oecls_netdev(devid, oecls_dev) {
+ strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ);
+ if (reuseport_check(devid, ctx.dip4, ctx.dport)) {
+ oecls_error("dip4:%pI4, dport:%d reuse!\n", &ctx.dip4, ctx.dport);
+ continue;
+ }
+
+ // Calculate the bound queue
+ rxq_id = alloc_rxq_id(nid, devid);
+ if (rxq_id < 0)
+ continue;
+
+ // Configure the ntuple rule on the device
+ ctx.action = (u16)rxq_id;
+ err = cfg_ethtool_rule(&ctx, false);
+ if (err) {
+ oecls_error("add sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n",
+ sk, nid, devid, ctx.action, ctx.ret_loc, err);
+ continue;
+ }
+
+ // Add sk rule
+ add_sk_rule(devid, ctx.dip4, ctx.dport, sk, ctx.action, ctx.ret_loc, nid);
+ }
+ mutex_unlock(&oecls_sk_rules.mutex);
+}
+
+static void ethtool_cfg_rxcls(void *data, struct sock *sk, int is_del)
+{
+ if (sk->sk_state != TCP_LISTEN)
+ return;
+
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+ return;
+
+ oecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, ip:%pI4, port:%d\n", smp_processor_id(),
+ current->comm, sk, is_del, &sk->sk_rcv_saddr, (u16)sk->sk_num);
+
+ if (is_del)
+ del_ntuple_rule(sk);
+ else
+ add_ntuple_rule(sk);
+}
+
+static void clean_oecls_sk_rules(void)
+{
+ struct oecls_netdev_info *oecls_dev;
+ struct cmd_context ctx = { 0 };
+ struct oecls_sk_rule *rule;
+ struct hlist_head *hlist;
+ struct hlist_node *n;
+ unsigned int i;
+ int err;
+
+ mutex_lock(&oecls_sk_rules.mutex);
+ for (i = 0; i < OECLS_SK_RULE_HASHSIZE; i++) {
+ hlist = &oecls_sk_rules.hash[i];
+
+ hlist_for_each_entry_safe(rule, n, hlist, node) {
+ oecls_dev = get_oecls_netdev_info(rule->devid);
+ if (!oecls_dev)
+ continue;
+ strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ);
+ ctx.del_ruleid = rule->ruleid;
+ err = cfg_ethtool_rule(&ctx, true);
+ oecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, err:%d\n", rule->sk,
+ rule->devid, rule->action, rule->ruleid, err);
+
+ hlist_del(&rule->node);
+ oecls_debug("clean rule=%p\n", rule);
+ kfree(rule);
+ }
+ }
+ mutex_unlock(&oecls_sk_rules.mutex);
+}
+
+void oecls_ntuple_res_init(void)
+{
+ init_oecls_sk_rules();
+ register_trace_ethtool_cfg_rxcls(&ethtool_cfg_rxcls, NULL);
+}
+
+void oecls_ntuple_res_clean(void)
+{
+ unregister_trace_ethtool_cfg_rxcls(&ethtool_cfg_rxcls, NULL);
+ clean_oecls_sk_rules();
+}
--
2.33.0
2
1
Pu Lehui (1):
tracing: Limit access to parser->buffer when trace_get_user failed
Steven Rostedt (1):
tracing: Remove unneeded goto out logic
kernel/trace/trace.c | 33 +++++++++++++++------------------
kernel/trace/trace.h | 8 +++++++-
2 files changed, 22 insertions(+), 19 deletions(-)
--
2.34.1
2
3
Jinjiang Tu (2):
mm/vmscan: don't try to reclaim hwpoison folio
mm/vmscan: fix hwpoisoned large folio handling in shrink_folio_list
mm/vmscan.c | 10 ++++++++++
1 file changed, 10 insertions(+)
--
2.43.0
2
3

26 Aug '25
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICS3XV
CVE: NA
--------------------------------
This introduces a network optimization method named oenetcls. It can
configure ntuple rules and bind interrupts to netdev queues
automatically.
Signed-off-by: Yue Haibing <yuehaibing(a)huawei.com>
Signed-off-by: Wang Liang <wangliang74(a)huawei.com>
Signed-off-by: Liu Jian <liujian56(a)huawei.com>
Signed-off-by: yuelg <yuelg(a)chinaunicom.cn>
---
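
For readers skimming the diff: in ntuple mode the module programs an ordinary
ethtool RX classification rule per listening socket. A minimal sketch of the
flow spec it builds, mirroring cfg_ethtool_rule() in oenetcls_ntuple.c (needs
<linux/ethtool.h>; the address, port and queue index below are placeholder
values):

/* Illustrative only: the shape of the ntuple rule oenetcls inserts. */
struct ethtool_rx_flow_spec fsp = {
	.flow_type   = TCP_V4_FLOW,
	.location    = RX_CLS_LOC_ANY,             /* let the driver pick a rule slot */
	.ring_cookie = 3,                          /* steer matching packets to rx queue 3 */
	.h_u.tcp_ip4_spec = {
		.ip4dst = cpu_to_be32(0xc0a80001), /* 192.168.0.1, the listener address */
		.pdst   = cpu_to_be16(6379),       /* the listener port */
	},
	.m_u.tcp_ip4_spec = {
		.ip4dst = cpu_to_be32(~0U),        /* exact match on destination IP */
		.pdst   = cpu_to_be16(0xffff),     /* exact match on destination port */
	},
};
/* The rule is submitted with an ETHTOOL_SRXCLSRLINS ioctl through
 * send_ethtool_ioctl(), see rxclass_rule_ins() in oenetcls_ntuple.c.
 */
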
drivers/hooks/Kconfig | 10 +
drivers/hooks/vendor_hooks.c | 8 +
include/net/netdev_rx_queue.h | 2 +-
include/trace/hooks/oenetcls.h | 44 ++
kernel/irq/irqdesc.c | 2 +-
net/Kconfig | 1 +
net/Makefile | 1 +
net/core/dev.c | 20 +
net/ipv4/af_inet.c | 4 +
net/ipv4/tcp.c | 7 +
net/oenetcls/Kconfig | 10 +
net/oenetcls/Makefile | 8 +
net/oenetcls/asmdefs.h | 61 ++
net/oenetcls/memcpy-sve.S | 157 +++++
net/oenetcls/oenetcls.h | 177 ++++++
net/oenetcls/oenetcls_flow.c | 403 ++++++++++++
net/oenetcls/oenetcls_main.c | 1076 ++++++++++++++++++++++++++++++++
net/oenetcls/oenetcls_ntuple.c | 565 +++++++++++++++++
18 files changed, 2554 insertions(+), 2 deletions(-)
create mode 100644 include/trace/hooks/oenetcls.h
create mode 100644 net/oenetcls/Kconfig
create mode 100644 net/oenetcls/Makefile
create mode 100644 net/oenetcls/asmdefs.h
create mode 100644 net/oenetcls/memcpy-sve.S
create mode 100644 net/oenetcls/oenetcls.h
create mode 100644 net/oenetcls/oenetcls_flow.c
create mode 100644 net/oenetcls/oenetcls_main.c
create mode 100644 net/oenetcls/oenetcls_ntuple.c
diff --git a/drivers/hooks/Kconfig b/drivers/hooks/Kconfig
index 6a00168e67ad..90b0f6ea4040 100644
--- a/drivers/hooks/Kconfig
+++ b/drivers/hooks/Kconfig
@@ -20,4 +20,14 @@ config VENDOR_BOND_HOOKS
Allow vendor modules to attach bonding driver hooks defined via
DECLARE_HOOK or DECLARE_RESTRICTED_HOOK.
+config OENETCLS_HOOKS
+ bool "Oenetcls driver Hooks"
+ depends on VENDOR_HOOKS
+ default n
+ help
+ Enable oenetcls vendor hooks.
+ Allow vendor modules to attach oenetcls hooks defined via
+ DECLARE_HOOK or DECLARE_RESTRICTED_HOOK.
+ Use OENETCLS && OENETCLS_HOOKS to enable the oenetcls feature.
+
endmenu
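
The hooks gated by this option are ordinary vendor tracehooks, so a module
attaches handlers through the register_trace_<name>() helpers that
DECLARE_HOOK generates. A minimal sketch of that wiring with a placeholder
handler (the real callbacks are installed in net/oenetcls/oenetcls_flow.c and
oenetcls_ntuple.c):

/* Sketch only: attaching to the oecls_flow_update vendor hook.
 * my_flow_update() is a stand-in name; oenetcls registers its own
 * oecls_flow_update() callback the same way.
 */
#include <trace/hooks/oenetcls.h>

static void my_flow_update(void *data, struct sock *sk)
{
	/* e.g. remember which CPU is currently reading from this socket */
}

static int attach_oecls_hooks(void)
{
	return register_trace_oecls_flow_update(my_flow_update, NULL);
}

static void detach_oecls_hooks(void)
{
	unregister_trace_oecls_flow_update(my_flow_update, NULL);
}
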
diff --git a/drivers/hooks/vendor_hooks.c b/drivers/hooks/vendor_hooks.c
index 85bda58159f6..d9b85b57a742 100644
--- a/drivers/hooks/vendor_hooks.c
+++ b/drivers/hooks/vendor_hooks.c
@@ -9,6 +9,7 @@
#define CREATE_TRACE_POINTS
#include <trace/hooks/vendor_hooks.h>
#include <trace/hooks/bonding.h>
+#include <trace/hooks/oenetcls.h>
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -18,3 +19,10 @@
#ifdef CONFIG_VENDOR_BOND_HOOKS
EXPORT_TRACEPOINT_SYMBOL_GPL(vendor_bond_check_dev_link);
#endif
+
+#ifdef CONFIG_OENETCLS_HOOKS
+EXPORT_TRACEPOINT_SYMBOL_GPL(oecls_flow_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(oecls_set_cpu);
+EXPORT_TRACEPOINT_SYMBOL_GPL(oecls_timeout);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ethtool_cfg_rxcls);
+#endif
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index 377f43745abf..3fb5d8eb18fc 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -22,7 +22,7 @@ struct netdev_rx_queue {
struct xsk_buff_pool *pool;
#endif
- KABI_RESERVE(1)
+ KABI_USE(1, void *__rcu oecls_ftb)
KABI_RESERVE(2)
KABI_RESERVE(3)
KABI_RESERVE(4)
diff --git a/include/trace/hooks/oenetcls.h b/include/trace/hooks/oenetcls.h
new file mode 100644
index 000000000000..c38545d7a6a2
--- /dev/null
+++ b/include/trace/hooks/oenetcls.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * oenetcls driver Hooks
+ *
+ * Copyright (c) 2025, Huawei Tech. Co., Ltd.
+ */
+
+#ifdef CONFIG_OENETCLS_HOOKS
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM oenetcls
+
+#define TRACE_INCLUDE_PATH trace/hooks
+#if !defined(_TRACE_OENETCLS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OENETCLS_H
+#include <linux/tracepoint.h>
+#include <trace/hooks/vendor_hooks.h>
+
+struct sock;
+struct sk_buff;
+struct net_device;
+
+DECLARE_HOOK(oecls_flow_update,
+TP_PROTO(struct sock *sk),
+TP_ARGS(sk));
+
+DECLARE_HOOK(oecls_set_cpu,
+TP_PROTO(struct sk_buff *skb),
+TP_ARGS(skb));
+
+DECLARE_HOOK(oecls_timeout,
+TP_PROTO(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id, bool *ret),
+TP_ARGS(dev, rxq_index, flow_id, filter_id, ret));
+
+DECLARE_HOOK(ethtool_cfg_rxcls,
+TP_PROTO(struct sock *sk, int is_del),
+TP_ARGS(sk, is_del));
+
+#endif
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
+#endif
+
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 46094f0c9fcd..29f4101585cf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -383,7 +383,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return mtree_load(&sparse_irqs, irq);
}
-#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE
+#if defined(CONFIG_KVM_BOOK3S_64_HV_MODULE) || IS_ENABLED(CONFIG_OENETCLS)
EXPORT_SYMBOL_GPL(irq_to_desc);
#endif
diff --git a/net/Kconfig b/net/Kconfig
index 2fc1860faeb4..bea9c2529bb1 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -72,6 +72,7 @@ source "net/xfrm/Kconfig"
source "net/iucv/Kconfig"
source "net/smc/Kconfig"
source "net/xdp/Kconfig"
+source "net/oenetcls/Kconfig"
config NET_HANDSHAKE
bool
diff --git a/net/Makefile b/net/Makefile
index 4c4dc535453d..4ffee8a3c427 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -79,4 +79,5 @@ obj-$(CONFIG_NET_NCSI) += ncsi/
obj-$(CONFIG_XDP_SOCKETS) += xdp/
obj-$(CONFIG_MPTCP) += mptcp/
obj-$(CONFIG_MCTP) += mctp/
+obj-$(CONFIG_OENETCLS) += oenetcls/
obj-$(CONFIG_NET_HANDSHAKE) += handshake/
diff --git a/net/core/dev.c b/net/core/dev.c
index cbb4bd4718cd..a0624c801a35 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -154,6 +154,7 @@
#include <linux/once_lite.h>
#include <net/netdev_rx_queue.h>
#include <linux/if_caqm.h>
+#include <trace/hooks/oenetcls.h>
#include "dev.h"
#include "net-sysfs.h"
@@ -4727,6 +4728,11 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
bool expire = true;
unsigned int cpu;
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ trace_oecls_timeout(dev, rxq_index, flow_id, filter_id, &expire);
+ if (expire)
+ return true;
+#endif
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id <= flow_table->mask) {
@@ -5814,6 +5820,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
}
}
#endif
+
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ trace_oecls_set_cpu(skb);
+#endif
+
ret = __netif_receive_skb(skb);
rcu_read_unlock();
return ret;
@@ -5848,6 +5859,12 @@ void netif_receive_skb_list_internal(struct list_head *head)
}
}
#endif
+
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ list_for_each_entry_safe(skb, next, head, list)
+ trace_oecls_set_cpu(skb);
+#endif
+
__netif_receive_skb_list(head);
rcu_read_unlock();
}
@@ -9960,6 +9977,9 @@ int __netdev_update_features(struct net_device *dev)
return err < 0 ? 0 : 1;
}
+#if IS_ENABLED(CONFIG_OENETCLS)
+EXPORT_SYMBOL(__netdev_update_features);
+#endif
/**
* netdev_update_features - recalculate device features
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f336b2ddf972..ee224b196666 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -121,6 +121,7 @@
#include <net/compat.h>
#include <trace/events/sock.h>
+#include <trace/hooks/oenetcls.h>
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
@@ -219,6 +220,9 @@ int __inet_listen_sk(struct sock *sk, int backlog)
return err;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ trace_ethtool_cfg_rxcls(sk, 0);
+#endif
}
return 0;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c2419903f0e4..3e45b736aa10 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>
+#include <trace/hooks/oenetcls.h>
/* Track pending CMSGs. */
enum {
@@ -2577,6 +2578,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ trace_oecls_flow_update(sk);
+#endif
if (sk_can_busy_loop(sk) &&
skb_queue_empty_lockless(&sk->sk_receive_queue) &&
sk->sk_state == TCP_ESTABLISHED)
@@ -2940,6 +2944,9 @@ void __tcp_close(struct sock *sk, long timeout)
void tcp_close(struct sock *sk, long timeout)
{
lock_sock(sk);
+#if IS_ENABLED(CONFIG_OENETCLS_HOOKS)
+ trace_ethtool_cfg_rxcls(sk, 1);
+#endif
__tcp_close(sk, timeout);
release_sock(sk);
if (!sk->sk_net_refcnt)
diff --git a/net/oenetcls/Kconfig b/net/oenetcls/Kconfig
new file mode 100644
index 000000000000..2ab980258c31
--- /dev/null
+++ b/net/oenetcls/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config OENETCLS
+ tristate "Network classification"
+ depends on OENETCLS_HOOKS
+ default n
+ help
+ Allows configuring ntuple rules and binding interrupts to netdev
+ queues automatically.
+ Use OENETCLS && OENETCLS_HOOKS to enable the oenetcls feature.
+ Use the 'mode' parameter to select the running mode.
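
For reference, the 'mode' parameter mentioned above selects between the two
resource-init paths; a condensed sketch of the dispatch done by
oecls_init()/oecls_exit() in net/oenetcls/oenetcls_main.c (oecls_select_mode()
is an illustrative name only):

/* mode=0: per-socket ethtool ntuple rules; any other value: software flow
 * tables plus ndo_rx_flow_steer().  Condensed from oecls_init()/oecls_exit().
 */
static void oecls_select_mode(int mode)
{
	if (mode == 0)
		oecls_ntuple_res_init();	/* ntuple mode */
	else
		oecls_flow_res_init();		/* flow steering mode */
}
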
diff --git a/net/oenetcls/Makefile b/net/oenetcls/Makefile
new file mode 100644
index 000000000000..cdf17ea096d3
--- /dev/null
+++ b/net/oenetcls/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_OENETCLS) = oenetcls.o
+oenetcls-y := oenetcls_main.o oenetcls_ntuple.o oenetcls_flow.o
+ifeq ($(CONFIG_ARM64_SVE),y)
+oenetcls-y += memcpy-sve.o
+endif
+
diff --git a/net/oenetcls/asmdefs.h b/net/oenetcls/asmdefs.h
new file mode 100644
index 000000000000..8138a94c18af
--- /dev/null
+++ b/net/oenetcls/asmdefs.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+/* Branch Target Identitication support. */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret). */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
+
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files. */
+GNU_PROPERTY(FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name, %function; \
+ .align alignment; \
+name: \
+ .cfi_startproc; \
+ BTI_C;
+
+#define ENTRY(name) ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name) \
+ .global name; \
+ .type name, %function; \
+ name:
+
+#define END(name) \
+ .cfi_endproc; \
+ .size name, .-name;
+
+#define L(l) .L ## l
+
+#endif
diff --git a/net/oenetcls/memcpy-sve.S b/net/oenetcls/memcpy-sve.S
new file mode 100644
index 000000000000..106e4c30294c
--- /dev/null
+++ b/net/oenetcls/memcpy-sve.S
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include "asmdefs.h"
+
+.arch armv8-a+sve
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp1 x6
+#define vlen x6
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+ SVE vectors are used to speedup small copies.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64_sve)
+ENTRY (__memcpy_aarch64_sve)
+ cmp count, 128
+ b.hi L(copy_long)
+ cntb vlen
+ cmp count, vlen, lsl 1
+ b.hi L(copy32_128)
+
+ whilelo p0.b, xzr, count
+ whilelo p1.b, vlen, count
+ ld1b z0.b, p0/z, [src, 0, mul vl]
+ ld1b z1.b, p1/z, [src, 1, mul vl]
+ st1b z0.b, p0, [dstin, 0, mul vl]
+ st1b z1.b, p1, [dstin, 1, mul vl]
+ ret
+
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ add srcend, src, count
+ add dstend, dstin, count
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ cbz tmp1, L(return)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+L(return):
+ ret
+
+END (__memcpy_aarch64_sve)
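
This routine is reached only through the memcpy_r() wrapper defined in
net/oenetcls/oenetcls.h below. A minimal sketch of that run-time dispatch
(oecls_copy() is an illustrative name; the header implements the equivalent
memcpy_r() macro):

#include <linux/string.h>
#include <asm/cpufeature.h>

void *__memcpy_aarch64_sve(void *dst, const void *src, size_t len);

/* Use the SVE routine above when the CPU supports SVE, otherwise fall back
 * to plain memcpy().
 */
static inline void oecls_copy(void *dst, const void *src, size_t len)
{
	if (system_supports_sve())
		__memcpy_aarch64_sve(dst, src, len);
	else
		memcpy(dst, src, len);
}
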
diff --git a/net/oenetcls/oenetcls.h b/net/oenetcls/oenetcls.h
new file mode 100644
index 000000000000..215ae3e7e153
--- /dev/null
+++ b/net/oenetcls/oenetcls.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _NET_OENETCLS_H
+#define _NET_OENETCLS_H
+#include <linux/if.h>
+#include <linux/mutex.h>
+#include <linux/cpufeature.h>
+
+#define OECLS_MAX_NETDEV_NUM 8
+#define OECLS_MAX_RXQ_NUM_PER_DEV 256
+#define OECLS_MAX_CPU_NUM 1024
+
+#define OECLS_TIMEOUT (5 * HZ)
+#define OECLS_NO_FILTER 0xffff
+#define OECLS_NO_CPU 0xffff
+
+struct oecls_netdev_queue_info {
+ int irq;
+ int affinity_cpu;
+};
+
+struct oecls_netdev_info {
+ char dev_name[IFNAMSIZ];
+ struct net_device *netdev;
+ int rxq_num;
+ struct oecls_netdev_queue_info rxq[OECLS_MAX_RXQ_NUM_PER_DEV];
+ int old_filter_state;
+};
+
+struct oecls_rxq {
+ int rxq_id;
+ int status;
+};
+
+struct oecls_numa_clusterinfo {
+ int cluster_id;
+ int cur_freeidx;
+ struct oecls_rxq rxqs[OECLS_MAX_RXQ_NUM_PER_DEV];
+};
+
+struct oecls_numa_bound_dev_info {
+ DECLARE_BITMAP(bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV);
+ struct oecls_numa_clusterinfo *cluster_info;
+};
+
+struct oecls_numa_info {
+ DECLARE_BITMAP(avail_cpus, OECLS_MAX_CPU_NUM);
+ struct oecls_numa_bound_dev_info bound_dev[OECLS_MAX_NETDEV_NUM];
+};
+
+struct cmd_context {
+ char netdev[IFNAMSIZ];
+ u32 dip4;
+ u16 dport;
+ u16 action;
+ u32 ruleid;
+ u32 del_ruleid;
+ int ret_loc;
+};
+
+#define OECLS_SK_RULE_HASHSIZE 256
+#define OECLS_SK_RULE_HASHMASK (OECLS_SK_RULE_HASHSIZE - 1)
+
+struct oecls_sk_rule_list {
+ struct hlist_head hash[OECLS_SK_RULE_HASHSIZE];
+ /* Mutex to synchronize access to ntuple rule locking */
+ struct mutex mutex;
+};
+
+struct oecls_sk_rule {
+ struct hlist_node node;
+ int devid;
+ void *sk;
+ int dip4;
+ int dport;
+ int action;
+ int ruleid;
+ int nid;
+};
+
+struct oecls_sk_entry {
+ struct hlist_node node;
+ void *sk;
+ u32 sk_rule_hash;
+};
+
+struct oecls_dev_flow {
+ unsigned short cpu;
+ unsigned short filter;
+ unsigned int last_qtail;
+ int isvalid;
+ unsigned long timeout;
+};
+
+struct oecls_dev_flow_table {
+ unsigned int mask;
+ struct rcu_head rcu;
+ struct oecls_dev_flow flows[];
+};
+
+struct oecls_sock_flow_table {
+ u32 mask;
+ u32 ents[] ____cacheline_aligned_in_smp;
+};
+
+#define OECLS_DEV_FLOW_TABLE_NUM 0x1000
+#define OECLS_SOCK_FLOW_TABLE_NUM 0x100000
+#define OECLS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct oecls_dev_flow_table) + \
+ ((_num) * sizeof(struct oecls_dev_flow)))
+#define OECLS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct oecls_sock_flow_table, ents[_num]))
+
+#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \
+ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH)
+#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \
+ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \
+ NETIF_F_RXHASH)
+
+struct rmgr_ctrl {
+ int driver_select;
+ unsigned long *slot;
+ __u32 n_rules;
+ __u32 size;
+};
+
+extern int match_ip_flag;
+extern int debug;
+extern int oecls_netdev_num;
+extern int oecls_numa_num;
+
+#define oecls_debug(fmt, ...) \
+ do { \
+ if (debug) \
+ trace_printk(fmt, ## __VA_ARGS__); \
+ } while (0)
+
+#define oecls_error(fmt, ...) \
+ do { \
+ pr_err("oenetcls [%s:%d]: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); \
+ trace_printk(fmt, ## __VA_ARGS__); \
+ } while (0)
+
+struct oecls_netdev_info *get_oecls_netdev_info(unsigned int index);
+
+#define for_each_oecls_netdev(devid, oecls_dev) \
+ for (devid = 0, oecls_dev = get_oecls_netdev_info(devid); \
+ (devid < oecls_netdev_num) && oecls_dev; \
+ devid++, oecls_dev = get_oecls_netdev_info(devid))
+
+struct oecls_numa_info *get_oecls_numa_info(unsigned int nid);
+
+#define for_each_oecls_numa(nid, numa_info) \
+ for (nid = 0, numa_info = get_oecls_numa_info(nid); \
+ (nid < oecls_numa_num) && numa_info; \
+ nid++, numa_info = get_oecls_numa_info(nid))
+
+#ifdef CONFIG_ARM64_SVE
+void *__memcpy_aarch64_sve(void *, const void *, size_t);
+#define memcpy_r(dst, src, len) \
+ do { \
+ if (system_supports_sve()) \
+ __memcpy_aarch64_sve(dst, src, len); \
+ else \
+ memcpy(dst, src, len); \
+ } while (0)
+#else
+#define memcpy_r(dst, src, len) memcpy(dst, src, len)
+#endif
+
+int check_appname(char *task_name);
+int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd);
+int alloc_rxq_id(int nid, int devid);
+void free_rxq_id(int nid, int devid, int rxq_id);
+void oecls_ntuple_res_init(void);
+void oecls_ntuple_res_clean(void);
+void oecls_flow_res_init(void);
+void oecls_flow_res_clean(void);
+
+#endif /* _NET_OENETCLS_H */
diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c
new file mode 100644
index 000000000000..5dc58e8bae25
--- /dev/null
+++ b/net/oenetcls/oenetcls_flow.c
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/inet.h>
+#include <net/netdev_rx_queue.h>
+#include <net/sock.h>
+#include <trace/hooks/oenetcls.h>
+#include "oenetcls.h"
+
+static u32 oecls_cpu_mask;
+static struct oecls_sock_flow_table __rcu *oecls_sock_flow_table;
+static DEFINE_MUTEX(oecls_sock_flow_mutex);
+static DEFINE_SPINLOCK(oecls_dev_flow_lock);
+
+bool is_oecls_config_netdev(const char *name)
+{
+ struct oecls_netdev_info *netdev_info;
+ int netdev_loop;
+
+ for_each_oecls_netdev(netdev_loop, netdev_info)
+ if (strcmp(netdev_info->dev_name, name) == 0)
+ return true;
+
+ return false;
+}
+
+static void oecls_timeout(void *data, struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id, bool *ret)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct oecls_dev_flow_table *flow_table;
+ struct oecls_dev_flow *rflow;
+ bool expire = true;
+ unsigned int cpu;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->oecls_ftb);
+ if (flow_table && flow_id <= flow_table->mask) {
+ rflow = &flow_table->flows[flow_id];
+ cpu = READ_ONCE(rflow->cpu);
+ oecls_debug("dev:%s, rxq:%d, flow_id:%u, filter_id:%d/%d, cpu:%d\n", dev->name,
+ rxq_index, flow_id, filter_id, rflow->filter, cpu);
+
+ if (rflow->filter == filter_id && cpu < nr_cpu_ids) {
+ if (time_before(jiffies, rflow->timeout + OECLS_TIMEOUT)) {
+ expire = false;
+ } else {
+ rflow->isvalid = 0;
+ WRITE_ONCE(rflow->cpu, OECLS_NO_CPU);
+ }
+ }
+ }
+ rcu_read_unlock();
+ oecls_debug("%s, dev:%s, rxq:%d, flow_id:%u, filter_id:%d, expire:%d\n", __func__,
+ dev->name, rxq_index, flow_id, filter_id, expire);
+ *ret = expire;
+}
+
+static void oecls_flow_update(void *data, struct sock *sk)
+{
+ struct oecls_sock_flow_table *tb;
+ unsigned int hash, index;
+ u32 val;
+ u32 cpu = raw_smp_processor_id();
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return;
+
+ if (check_appname(current->comm))
+ return;
+
+ rcu_read_lock();
+ tb = rcu_dereference(oecls_sock_flow_table);
+ hash = READ_ONCE(sk->sk_rxhash);
+ if (tb && hash) {
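+ /* Each entry keeps the flow hash in the high bits and the CPU
+  * currently reading this socket in the low bits.
+  */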
+ index = hash & tb->mask;
+ val = hash & ~oecls_cpu_mask;
+ val |= cpu;
+
+ if (READ_ONCE(tb->ents[index]) != val) {
+ WRITE_ONCE(tb->ents[index], val);
+
+ oecls_debug("[%s] sk:%p, hash:0x%x, index:0x%x, val:0x%x, cpu:%d\n",
+ current->comm, sk, hash, index, val, cpu);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb)
+{
+ struct oecls_netdev_info *netdev_info;
+ int netdev_loop;
+ u32 hash, index;
+ struct oecls_numa_info *numa_info;
+ struct oecls_numa_bound_dev_info *bound_dev = NULL;
+ int rxq_id, rxq_num, i;
+
+ numa_info = get_oecls_numa_info(nid);
+ if (!numa_info)
+ return -1;
+
+ for_each_oecls_netdev(netdev_loop, netdev_info) {
+ if (strcmp(netdev_info->dev_name, dev->name) == 0) {
+ bound_dev = &numa_info->bound_dev[netdev_loop];
+ break;
+ }
+ }
+
+ if (!bound_dev)
+ return -1;
+ rxq_num = bitmap_weight(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV);
+ if (rxq_num == 0)
+ return -1;
+
+ hash = skb_get_hash(skb);
+ index = hash % rxq_num;
+
+ i = 0;
+ for_each_set_bit(rxq_id, bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV)
+ if (index == i++)
+ return rxq_id;
+
+ return -1;
+}
+
+static void set_oecls_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct oecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu)
+{
+ struct netdev_rx_queue *rxqueue;
+ struct oecls_dev_flow_table *dtb;
+ struct oecls_dev_flow *rflow;
+ u32 flow_id, hash;
+ int rxq_index;
+ int rc;
+
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+ !(dev->features & NETIF_F_NTUPLE))
+ return;
+
+ rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb);
+ if (rxq_index < 0 || rxq_index == skb_get_rx_queue(skb))
+ return;
+
+ rxqueue = dev->_rx + rxq_index;
+ dtb = rcu_dereference(rxqueue->oecls_ftb);
+ if (!dtb)
+ return;
+
+ hash = skb_get_hash(skb);
+ flow_id = hash & dtb->mask;
+ rflow = &dtb->flows[flow_id];
+ if (rflow->isvalid && rflow->cpu == next_cpu) {
+ rflow->timeout = jiffies;
+ return;
+ }
+
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id);
+ oecls_debug("skb:%p, rxq:%d, hash:0x%x, flow_id:%u, old_rxq_id:%d, next_cpu:%d, rc:%d\n",
+ skb, rxq_index, hash, flow_id, old_rxq_id, next_cpu, rc);
+ if (rc < 0)
+ return;
+
+ rflow->filter = rc;
+ rflow->isvalid = 1;
+ rflow->timeout = jiffies;
+ if (old_rflow->filter == rflow->filter)
+ old_rflow->filter = OECLS_NO_FILTER;
+ rflow->cpu = next_cpu;
+}
+
+static void __oecls_set_cpu(struct sk_buff *skb, struct net_device *ndev,
+ struct oecls_sock_flow_table *tb, struct oecls_dev_flow_table *dtb,
+ int old_rxq_id)
+{
+ struct oecls_dev_flow *rflow;
+ u32 last_recv_cpu, hash, val;
+ u32 tcpu = 0;
+ u32 cpu = raw_smp_processor_id();
+
+ skb_reset_network_header(skb);
+ hash = skb_get_hash(skb);
+ if (!hash)
+ return;
+
+ val = READ_ONCE(tb->ents[hash & tb->mask]);
+ last_recv_cpu = val & oecls_cpu_mask;
+ rflow = &dtb->flows[hash & dtb->mask];
+ tcpu = rflow->cpu;
+
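+ /* Skip if the table entry records a different flow, or if the receiving
+  * CPU is already on the same NUMA node as the last consumer; steering is
+  * only attempted across nodes.
+  */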
+ if ((val ^ hash) & ~oecls_cpu_mask)
+ return;
+
+ if (cpu_to_node(cpu) == cpu_to_node(last_recv_cpu))
+ return;
+
+ if (tcpu >= nr_cpu_ids)
+ set_oecls_cpu(ndev, skb, rflow, old_rxq_id, last_recv_cpu);
+}
+
+static void oecls_set_cpu(void *data, struct sk_buff *skb)
+{
+ struct net_device *ndev = skb->dev;
+ struct oecls_sock_flow_table *stb;
+ struct oecls_dev_flow_table *dtb;
+ struct netdev_rx_queue *rxqueue;
+ int rxq_id = -1;
+
+ if (!ndev)
+ return;
+
+ if (!is_oecls_config_netdev(ndev->name))
+ return;
+
+ rxqueue = ndev->_rx;
+ if (skb_rx_queue_recorded(skb)) {
+ rxq_id = skb_get_rx_queue(skb);
+ if (rxq_id >= ndev->real_num_rx_queues) {
+ oecls_debug("ndev:%s, rxq:%d, real_num:%d\n", ndev->name,
+ rxq_id, ndev->real_num_rx_queues);
+ return;
+ }
+ rxqueue += rxq_id;
+ }
+
+ // oecls_debug("skb:%px, dev:%s, rxq_id:%d\n", skb, ndev->name, rxq_id);
+ if (rxq_id < 0)
+ return;
+
+ rcu_read_lock();
+ stb = rcu_dereference(oecls_sock_flow_table);
+ dtb = rcu_dereference(rxqueue->oecls_ftb);
+ if (stb && dtb)
+ __oecls_set_cpu(skb, ndev, stb, dtb, rxq_id);
+
+ rcu_read_unlock();
+}
+
+static void oecls_dev_flow_table_free(struct rcu_head *rcu)
+{
+ struct oecls_dev_flow_table *table = container_of(rcu,
+ struct oecls_dev_flow_table, rcu);
+ vfree(table);
+}
+
+static void oecls_dev_flow_table_cleanup(struct net_device *netdev, int qid)
+{
+ struct oecls_dev_flow_table *dtb;
+ struct netdev_rx_queue *queue;
+ int i;
+
+ spin_lock(&oecls_dev_flow_lock);
+ for (i = 0; i < qid; i++) {
+ queue = netdev->_rx + i;
+ dtb = rcu_dereference_protected(queue->oecls_ftb,
+ lockdep_is_held(&oecls_dev_flow_lock));
+ rcu_assign_pointer(queue->oecls_ftb, NULL);
+ if (dtb)
+ call_rcu(&dtb->rcu, oecls_dev_flow_table_free);
+ }
+ spin_unlock(&oecls_dev_flow_lock);
+}
+
+static int oecls_dev_flow_table_release(void)
+{
+ struct oecls_netdev_info *netdev_info;
+ int netdev_loop;
+ struct net_device *netdev;
+
+ for_each_oecls_netdev(netdev_loop, netdev_info) {
+ netdev = netdev_info->netdev;
+ if (!netdev)
+ continue;
+ oecls_dev_flow_table_cleanup(netdev, netdev->num_rx_queues);
+ }
+
+ return 0;
+}
+
+static int _oecls_dev_flow_table_init(struct net_device *netdev)
+{
+ struct oecls_dev_flow_table *table;
+ int size = OECLS_DEV_FLOW_TABLE_NUM;
+ struct netdev_rx_queue *queue;
+ int i, j, ret = 0;
+
+ size = roundup_pow_of_two(size);
+ oecls_debug("dev:%s, num_rx_queues:%d, mask:0x%x\n", netdev->name, netdev->num_rx_queues,
+ size - 1);
+
+ for (i = 0; i < netdev->num_rx_queues; i++) {
+ table = vmalloc(OECLS_DEV_FLOW_TABLE_SIZE(size));
+ if (!table) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ table->mask = size - 1;
+ for (j = 0; j < size; j++) {
+ table->flows[j].cpu = OECLS_NO_CPU;
+ table->flows[j].isvalid = 0;
+ }
+
+ queue = netdev->_rx + i;
+
+ spin_lock(&oecls_dev_flow_lock);
+ rcu_assign_pointer(queue->oecls_ftb, table);
+ spin_unlock(&oecls_dev_flow_lock);
+ }
+ return ret;
+fail:
+ oecls_dev_flow_table_cleanup(netdev, i);
+ return ret;
+}
+
+static int oecls_dev_flow_table_init(void)
+{
+ struct oecls_netdev_info *netdev_info;
+ int netdev_loop;
+ struct net_device *ndev;
+ int i, err;
+
+ for_each_oecls_netdev(netdev_loop, netdev_info) {
+ ndev = netdev_info->netdev;
+ if (!ndev)
+ continue;
+ err = _oecls_dev_flow_table_init(ndev);
+ if (err)
+ goto out;
+ }
+
+ return 0;
+out:
+ for (i = 0; i < netdev_loop; i++) {
+ netdev_info = get_oecls_netdev_info(i);
+ ndev = netdev_info->netdev;
+ if (!ndev)
+ continue;
+ oecls_dev_flow_table_cleanup(ndev, ndev->num_rx_queues);
+ }
+ return err;
+}
+
+static int oecls_sock_flow_table_release(void)
+{
+ struct oecls_sock_flow_table *tb;
+
+ mutex_lock(&oecls_sock_flow_mutex);
+ tb = rcu_dereference_protected(oecls_sock_flow_table,
+ lockdep_is_held(&oecls_sock_flow_mutex));
+ if (tb)
+ rcu_assign_pointer(oecls_sock_flow_table, NULL);
+ mutex_unlock(&oecls_sock_flow_mutex);
+ synchronize_rcu();
+ vfree(tb);
+
+ unregister_trace_oecls_flow_update(&oecls_flow_update, NULL);
+ unregister_trace_oecls_set_cpu(&oecls_set_cpu, NULL);
+ unregister_trace_oecls_timeout(&oecls_timeout, NULL);
+ return 0;
+}
+
+static int oecls_sock_flow_table_init(void)
+{
+ struct oecls_sock_flow_table *table;
+ int size = OECLS_SOCK_FLOW_TABLE_NUM;
+ int i;
+
+ size = roundup_pow_of_two(size);
+ table = vmalloc(OECLS_SOCK_FLOW_TABLE_SIZE(size));
+ if (!table)
+ return -ENOMEM;
+
+ oecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
+ oecls_debug("nr_cpu_ids:%d, oecls_cpu_mask:0x%x\n", nr_cpu_ids, oecls_cpu_mask);
+
+ table->mask = size - 1;
+ for (i = 0; i < size; i++)
+ table->ents[i] = OECLS_NO_CPU;
+
+ mutex_lock(&oecls_sock_flow_mutex);
+ rcu_assign_pointer(oecls_sock_flow_table, table);
+ mutex_unlock(&oecls_sock_flow_mutex);
+
+ register_trace_oecls_flow_update(oecls_flow_update, NULL);
+ register_trace_oecls_set_cpu(&oecls_set_cpu, NULL);
+ register_trace_oecls_timeout(&oecls_timeout, NULL);
+ return 0;
+}
+
+void oecls_flow_res_init(void)
+{
+ oecls_sock_flow_table_init();
+ oecls_dev_flow_table_init();
+}
+
+void oecls_flow_res_clean(void)
+{
+ oecls_sock_flow_table_release();
+ oecls_dev_flow_table_release();
+}
diff --git a/net/oenetcls/oenetcls_main.c b/net/oenetcls/oenetcls_main.c
new file mode 100644
index 000000000000..67c73f4595be
--- /dev/null
+++ b/net/oenetcls/oenetcls_main.c
@@ -0,0 +1,1076 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netdev_features.h>
+#include <linux/ethtool.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/rtnetlink.h>
+#include "oenetcls.h"
+
+int oecls_netdev_num;
+static struct oecls_netdev_info oecls_netdev_info_table[OECLS_MAX_NETDEV_NUM];
+
+int oecls_numa_num;
+static int oecls_cluster_cpu_num, oecls_cluster_per_numa;
+static struct oecls_numa_info *oecls_numa_info_table;
+
+int debug;
+module_param(debug, int, 0644);
+MODULE_PARM_DESC(debug, "debug switch");
+
+static int mode;
+module_param(mode, int, 0444);
+MODULE_PARM_DESC(mode, "running mode: 0=ntuple rules, 1=flow steering, default 0");
+
+static char ifname[64] = { 0 };
+module_param_string(ifname, ifname, sizeof(ifname), 0444);
+MODULE_PARM_DESC(ifname, "'#'-separated list of netdev names to manage");
+
+static char appname[64] = "redis-server";
+module_param_string(appname, appname, sizeof(appname), 0644);
+MODULE_PARM_DESC(appname, "'#'-separated list of process names to match, default redis-server");
+
+int match_ip_flag = 1;
+module_param(match_ip_flag, int, 0644);
+MODULE_PARM_DESC(match_ip_flag, "match destination IP in the ntuple rule, default 1");
+
+static int strategy;
+module_param(strategy, int, 0444);
+MODULE_PARM_DESC(strategy, "irq affinity strategy: 0=default, 1=cluster, 2=numa, 3=custom, default 0");
+
+static bool check_params(void)
+{
+ if (mode != 0 && mode != 1)
+ return false;
+
+ if (strlen(ifname) == 0)
+ return false;
+
+ return true;
+}
+
+int check_appname(char *task_name)
+{
+ char *start = appname, *end;
+
+ if (!strlen(appname))
+ return 0;
+
+ // appname is a '#'-separated list: app1#app2#...#appN
+ while (*start != '\0') {
+ end = strchr(start, '#');
+ if (end == start) {
+ start++;
+ continue;
+ }
+
+ if (!end) {
+ if (!strncmp(task_name, start, strlen(start)))
+ return 0;
+ break;
+ }
+
+ if (!strncmp(task_name, start, end - start))
+ return 0;
+ start = end + 1;
+ }
+ return -EOPNOTSUPP;
+}
+
+static u32 __ethtool_get_flags(struct net_device *dev)
+{
+ u32 flags = 0;
+
+ if (dev->features & NETIF_F_LRO)
+ flags |= ETH_FLAG_LRO;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX)
+ flags |= ETH_FLAG_RXVLAN;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_TX)
+ flags |= ETH_FLAG_TXVLAN;
+ if (dev->features & NETIF_F_NTUPLE)
+ flags |= ETH_FLAG_NTUPLE;
+ if (dev->features & NETIF_F_RXHASH)
+ flags |= ETH_FLAG_RXHASH;
+
+ return flags;
+}
+
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
+{
+ netdev_features_t features = 0, changed;
+
+ if (data & ~ETH_ALL_FLAGS)
+ return -EINVAL;
+
+ if (data & ETH_FLAG_LRO)
+ features |= NETIF_F_LRO;
+ if (data & ETH_FLAG_RXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_RX;
+ if (data & ETH_FLAG_TXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_TX;
+ if (data & ETH_FLAG_NTUPLE)
+ features |= NETIF_F_NTUPLE;
+ if (data & ETH_FLAG_RXHASH)
+ features |= NETIF_F_RXHASH;
+
+ /* allow changing only bits set in hw_features */
+ changed = (features ^ dev->features) & ETH_ALL_FEATURES;
+ if (changed & ~dev->hw_features)
+ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
+
+ dev->wanted_features =
+ (dev->wanted_features & ~changed) | (features & changed);
+
+ __netdev_update_features(dev);
+
+ return 0;
+}
+
+static void ethtool_rxnfc_copy_to_user(void *useraddr,
+ const struct ethtool_rxnfc *rxnfc,
+ size_t size, const u32 *rule_buf)
+{
+ memcpy_r(useraddr, rxnfc, size);
+ useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+
+ if (rule_buf)
+ memcpy_r(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32));
+}
+
+static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
+ u32 cmd, void *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ int rc;
+
+ if (!dev->ethtool_ops->set_rxnfc)
+ return -EOPNOTSUPP;
+
+ if (cmd == ETHTOOL_SRXFH)
+ info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info.data));
+
+ memcpy_r(&info, useraddr, info_size);
+ rc = dev->ethtool_ops->set_rxnfc(dev, &info);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS)
+ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL);
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
+ u32 cmd, void *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int ret;
+ void *rule_buf = NULL;
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ if (cmd == ETHTOOL_GRXFH)
+ info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info.data));
+
+ memcpy_r(&info, useraddr, info_size);
+
+ /* If FLOW_RSS was requested then user-space must be using the
+ * new definition, as FLOW_RSS is newer.
+ */
+ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) {
+ info_size = sizeof(info);
+ memcpy_r(&info, useraddr, info_size);
+ /* Since malicious users may modify the original data,
+ * we need to check whether FLOW_RSS is still requested.
+ */
+ if (!(info.flow_type & FLOW_RSS))
+ return -EINVAL;
+ }
+
+ if (info.cmd != cmd)
+ return -EINVAL;
+
+ if (info.cmd == ETHTOOL_GRXCLSRLALL) {
+ if (info.rule_cnt > 0) {
+ if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
+ rule_buf = kcalloc(info.rule_cnt, sizeof(u32),
+ GFP_KERNEL);
+ if (!rule_buf)
+ return -ENOMEM;
+ }
+ }
+
+ ret = ops->get_rxnfc(dev, &info, rule_buf);
+ if (ret < 0)
+ goto err_out;
+
+ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf);
+err_out:
+ kfree(rule_buf);
+
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
+ void *useraddr)
+{
+ struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
+
+ if (!dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_channels(dev, &channels);
+
+ memcpy_r(useraddr, &channels, sizeof(channels));
+ return 0;
+}
+
+static int ethtool_get_value(struct net_device *dev, char *useraddr,
+ u32 cmd, u32 (*actor)(struct net_device *))
+{
+ struct ethtool_value edata = { .cmd = cmd };
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ edata.data = actor(dev);
+
+ memcpy_r(useraddr, &edata, sizeof(edata));
+ return 0;
+}
+
+static int ethtool_set_value(struct net_device *dev, char *useraddr,
+ int (*actor)(struct net_device *, u32))
+{
+ struct ethtool_value edata;
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ memcpy_r(&edata, useraddr, sizeof(edata));
+
+ return actor(dev, edata.data);
+}
+
+static int dev_ethtool_kern(struct net *net, struct ifreq *ifr)
+{
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+ void *useraddr = ifr->ifr_data;
+ u32 ethcmd, sub_cmd;
+ int rc;
+ netdev_features_t old_features;
+
+ if (!dev || !netif_device_present(dev))
+ return -ENODEV;
+
+ memcpy_r(&ethcmd, useraddr, sizeof(ethcmd));
+
+ if (ethcmd == ETHTOOL_PERQUEUE)
+ memcpy_r(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd));
+ else
+ sub_cmd = ethcmd;
+
+ /* Allow some commands to be done by anyone */
+ switch (sub_cmd) {
+ case ETHTOOL_GFLAGS:
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ case ETHTOOL_GCHANNELS:
+ break;
+ default:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ }
+
+ if (dev->ethtool_ops->begin) {
+ rc = dev->ethtool_ops->begin(dev);
+ if (rc < 0)
+ return rc;
+ }
+ old_features = dev->features;
+
+ switch (ethcmd) {
+ case ETHTOOL_GFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ __ethtool_get_flags);
+ break;
+ case ETHTOOL_SFLAGS:
+ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
+ break;
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ rc = ethtool_get_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_SRXFH:
+ case ETHTOOL_SRXCLSRLDEL:
+ case ETHTOOL_SRXCLSRLINS:
+ rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_GCHANNELS:
+ rc = ethtool_get_channels(dev, useraddr);
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ }
+
+ if (dev->ethtool_ops->complete)
+ dev->ethtool_ops->complete(dev);
+
+ if (old_features != dev->features)
+ netdev_features_change(dev);
+
+ return rc;
+}
+
+int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd)
+{
+ struct ifreq ifr = {0};
+ int ret;
+
+ strncpy(ifr.ifr_name, ctx->netdev, sizeof(ctx->netdev));
+ ifr.ifr_data = cmd;
+
+ rtnl_lock();
+ ret = dev_ethtool_kern(&init_net, &ifr);
+ rtnl_unlock();
+
+ return ret;
+}
+
+struct oecls_netdev_info *get_oecls_netdev_info(unsigned int index)
+{
+ if (index >= OECLS_MAX_NETDEV_NUM)
+ return NULL;
+ return &oecls_netdev_info_table[index];
+}
+
+static struct oecls_netdev_info *alloc_oecls_netdev_info(void)
+{
+ if (oecls_netdev_num >= OECLS_MAX_NETDEV_NUM)
+ return NULL;
+
+ return &oecls_netdev_info_table[oecls_netdev_num++];
+}
+
+static bool check_irq_name(const char *irq_name, struct oecls_netdev_info *oecls_dev)
+{
+ if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx"))
+ return false;
+
+ if (strstr(irq_name, oecls_dev->dev_name))
+ return true;
+
+ if (oecls_dev->netdev->dev.parent &&
+ strstr(irq_name, dev_name(oecls_dev->netdev->dev.parent)))
+ return true;
+
+ return false;
+}
+
+static void get_netdev_queue_info(struct oecls_netdev_info *oecls_dev)
+{
+ struct oecls_netdev_queue_info *rxq_info;
+ struct irq_desc *desc;
+ int irq, cpu;
+
+ for_each_irq_desc(irq, desc) {
+ if (!desc->action)
+ continue;
+ if (!desc->action->name)
+ continue;
+ if (!check_irq_name(desc->action->name, oecls_dev))
+ continue;
+ if (oecls_dev->rxq_num >= OECLS_MAX_RXQ_NUM_PER_DEV)
+ break;
+ rxq_info = &oecls_dev->rxq[oecls_dev->rxq_num++];
+ rxq_info->irq = irq;
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(&desc->irq_data));
+ rxq_info->affinity_cpu = cpu;
+ oecls_debug("irq=%d, [%s], rxq_id=%d affinity_cpu:%d\n",
+ irq, desc->action->name, oecls_dev->rxq_num, cpu);
+ }
+}
+
+static int oecls_filter_enable(const char *dev_name, bool *old_state)
+{
+ struct ethtool_value eval = {0};
+ struct cmd_context ctx = {0};
+ int ret;
+
+ strncpy(ctx.netdev, dev_name, IFNAMSIZ);
+
+ eval.cmd = ETHTOOL_GFLAGS;
+ ret = send_ethtool_ioctl(&ctx, &eval);
+ if (ret != 0) {
+ oecls_error("get %s flags fail, ret:%d\n", dev_name, ret);
+ return ret;
+ }
+ if (eval.data & ETH_FLAG_NTUPLE) {
+ *old_state = true;
+ oecls_debug("%s ntuple is already on\n", dev_name);
+ return 0;
+ }
+
+ // Set ntuple feature
+ eval.cmd = ETHTOOL_SFLAGS;
+ eval.data |= ETH_FLAG_NTUPLE;
+ ret = send_ethtool_ioctl(&ctx, &eval);
+ if (ret != 0) {
+ oecls_error("set %s flags fail, ret:%d\n", dev_name, ret);
+ return ret;
+ }
+
+ // Get ntuple feature
+ eval.cmd = ETHTOOL_GFLAGS;
+ eval.data = 0;
+ ret = send_ethtool_ioctl(&ctx, &eval);
+ if (ret != 0) {
+ oecls_error("get %s flags fail, ret:%d\n", dev_name, ret);
+ return ret;
+ }
+ if (!(eval.data & ETH_FLAG_NTUPLE)) {
+ oecls_error("enable ntuple feature fail!\n");
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void oecls_filter_restore(const char *dev_name, bool old_state)
+{
+ struct ethtool_value eval = {0};
+ struct cmd_context ctx = {0};
+ bool cur_filter_state;
+ int ret;
+
+ strncpy(ctx.netdev, dev_name, IFNAMSIZ);
+
+ eval.cmd = ETHTOOL_GFLAGS;
+ ret = send_ethtool_ioctl(&ctx, &eval);
+ if (ret != 0) {
+ oecls_error("get %s flags fail, ret:%d\n", dev_name, ret);
+ return;
+ }
+
+ cur_filter_state = (eval.data & ETH_FLAG_NTUPLE) ? true : false;
+ if (cur_filter_state == old_state)
+ return;
+
+ // Set ntuple feature
+ eval.cmd = ETHTOOL_SFLAGS;
+ if (old_state)
+ eval.data |= ETH_FLAG_NTUPLE;
+ else
+ eval.data &= ~ETH_FLAG_NTUPLE;
+ ret = send_ethtool_ioctl(&ctx, &eval);
+ if (ret != 0) {
+ oecls_error("set %s flags fail, ret:%d\n", dev_name, ret);
+ return;
+ }
+}
+
+static int init_single_oecls_dev(char *if_name, unsigned int length)
+{
+ struct oecls_netdev_info *oecls_dev;
+ char dev_name[IFNAMSIZ] = { 0 };
+ struct net_device *netdev;
+	int cpy_len = length < IFNAMSIZ ? length : IFNAMSIZ - 1;
+ bool old_state = false;
+ int ret;
+
+ strncpy(dev_name, if_name, cpy_len);
+ netdev = dev_get_by_name(&init_net, dev_name);
+ if (!netdev) {
+		oecls_error("dev [%s] does not exist!\n", dev_name);
+ return -ENODEV;
+ }
+
+ if (!(netdev->flags & IFF_UP)) {
+ ret = -ENETDOWN;
+ oecls_error("dev:%s not up! flags=%d.\n", dev_name, netdev->flags);
+ goto out;
+ }
+
+ if (netdev->flags & IFF_LOOPBACK) {
+ ret = -EOPNOTSUPP;
+ oecls_error("Do not support loopback.\n");
+ goto out;
+ }
+
+ ret = oecls_filter_enable(dev_name, &old_state);
+ if (ret) {
+ oecls_error("dev [%s] not support ntuple! ret=%d\n", dev_name, ret);
+ goto out;
+ }
+
+ oecls_dev = alloc_oecls_netdev_info();
+ if (!oecls_dev) {
+ ret = -ENOMEM;
+ oecls_filter_restore(dev_name, old_state);
+ oecls_error("alloc oecls_dev fail! oecls_netdev_num:%d\n", oecls_netdev_num);
+ goto out;
+ }
+
+ memcpy_r(oecls_dev->dev_name, dev_name, IFNAMSIZ);
+ oecls_dev->old_filter_state = old_state;
+ oecls_dev->netdev = netdev;
+ get_netdev_queue_info(oecls_dev);
+ return 0;
+
+out:
+ dev_put(netdev);
+ return ret;
+}
+
+static void clean_oecls_netdev_info(void)
+{
+ struct oecls_netdev_info *oecls_dev;
+ struct net_device *netdev;
+ int devid;
+
+ for_each_oecls_netdev(devid, oecls_dev) {
+ oecls_filter_restore(oecls_dev->dev_name, oecls_dev->old_filter_state);
+ netdev = oecls_dev->netdev;
+ if (netdev) {
+ oecls_dev->netdev = NULL;
+ dev_put(netdev);
+ }
+ }
+
+ oecls_netdev_num = 0;
+}
+
+static int init_oecls_netdev_info(char *netdev_str)
+{
+ char *start = netdev_str, *end;
+ int err = -ENODEV;
+
+ while (*start != '\0') {
+		// skip leading '#'
+ end = strchr(start, '#');
+ if (end == start) {
+ start++;
+ continue;
+ }
+
+ // find the last ifname
+ if (!end) {
+ err = init_single_oecls_dev(start, strlen(start));
+ break;
+ }
+
+ err = init_single_oecls_dev(start, end - start);
+ if (err)
+ break;
+ start = end + 1;
+ }
+
+ return err;
+}
+
+struct oecls_numa_info *get_oecls_numa_info(unsigned int nid)
+{
+ if (nid >= oecls_numa_num)
+ return NULL;
+ return &oecls_numa_info_table[nid];
+}
+
+static void clean_oecls_numa_info(void)
+{
+ oecls_numa_num = 0;
+ kfree(oecls_numa_info_table);
+}
+
+static void init_numa_avail_cpus(int nid, struct oecls_numa_info *numa_info)
+{
+ int cpu;
+
+ oecls_debug("numa node %d: %*pb, %*pbl\n", nid, cpumask_pr_args(cpumask_of_node(nid)),
+ cpumask_pr_args(cpumask_of_node(nid)));
+
+ bitmap_zero(numa_info->avail_cpus, OECLS_MAX_CPU_NUM);
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ if (cpu >= OECLS_MAX_CPU_NUM)
+ return;
+ set_bit(cpu, numa_info->avail_cpus);
+ }
+}
+
+static void clean_oecls_rxq(void)
+{
+ struct oecls_numa_bound_dev_info *bound_dev;
+ struct oecls_netdev_info *oecls_dev;
+ struct oecls_numa_info *numa_info;
+ int nid, devid;
+
+ for_each_oecls_numa(nid, numa_info) {
+ for_each_oecls_netdev(devid, oecls_dev) {
+ bound_dev = &numa_info->bound_dev[devid];
+ kfree(bound_dev->cluster_info);
+ }
+ }
+}
+
+static int init_numa_rxq_bitmap(int nid, struct oecls_numa_info *numa_info)
+{
+ int bound_rxq_num, cluster_id, cluster_idx, cur_idx;
+ struct oecls_numa_bound_dev_info *bound_dev;
+ struct oecls_netdev_info *oecls_dev;
+ int rxq_id, devid, cpu, ret = 0;
+
+ for_each_oecls_netdev(devid, oecls_dev) {
+ bound_rxq_num = 0;
+ bound_dev = &numa_info->bound_dev[devid];
+ bitmap_zero(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV);
+ bound_dev->cluster_info = kcalloc(oecls_cluster_per_numa,
+ sizeof(*bound_dev->cluster_info), GFP_ATOMIC);
+ if (!bound_dev->cluster_info) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) {
+ cpu = oecls_dev->rxq[rxq_id].affinity_cpu;
+ if (cpu_to_node(cpu) == nid) {
+ set_bit(rxq_id, bound_dev->bitmap_rxq);
+ cluster_id = cpu / oecls_cluster_cpu_num;
+ cluster_idx = cluster_id % oecls_cluster_per_numa;
+ bound_dev->cluster_info[cluster_idx].cluster_id = cluster_id;
+ cur_idx = bound_dev->cluster_info[cluster_idx].cur_freeidx++;
+ bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].rxq_id = rxq_id;
+ bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].status = 1;
+ bound_rxq_num++;
+ oecls_debug("cpu:%d cluster_id:%d cluster_idx:%d rxq_id:%d cur_idx:%d\n",
+ cpu, cluster_id, cluster_idx, rxq_id, cur_idx);
+ }
+ }
+
+ oecls_debug("nid:%d, dev_id:%d, dev:%s, rxq_num:%d, bit_num:%d, bitmap_rxq:%*pbl\n",
+ nid, devid, oecls_dev->dev_name, oecls_dev->rxq_num,
+ bound_rxq_num, OECLS_MAX_RXQ_NUM_PER_DEV, bound_dev->bitmap_rxq);
+ }
+ return ret;
+
+out:
+ clean_oecls_rxq();
+ return ret;
+}
+
+static int get_cluster_rxq(struct oecls_numa_bound_dev_info *bound_dev)
+{
+ int cpu = smp_processor_id();
+ int cluster_id = cpu / oecls_cluster_cpu_num;
+ int i, j, rxq_id;
+
+ for (i = 0; i < oecls_cluster_per_numa; i++) {
+ if (cluster_id != bound_dev->cluster_info[i].cluster_id)
+ continue;
+ for (j = 0; j < OECLS_MAX_RXQ_NUM_PER_DEV; j++) {
+ if (bound_dev->cluster_info[i].rxqs[j].status == 1) {
+ bound_dev->cluster_info[i].rxqs[j].status = 2;
+ rxq_id = bound_dev->cluster_info[i].rxqs[j].rxq_id;
+ oecls_debug("cluster:%d cpu:%d alloc rxq_id:%d\n",
+ cluster_id, cpu, rxq_id);
+ return rxq_id;
+ }
+ }
+ }
+ oecls_debug("cluster:%d no free rxq for cpu:%d\n", cluster_id, cpu);
+ return -1;
+}
+
+static int put_cluster_rxq(struct oecls_numa_bound_dev_info *bound_dev, int rxq_id)
+{
+ int i, j;
+
+ for (i = 0; i < oecls_cluster_per_numa; i++) {
+ for (j = 0; j < OECLS_MAX_RXQ_NUM_PER_DEV; j++) {
+ if (bound_dev->cluster_info[i].rxqs[j].status == 2 &&
+ bound_dev->cluster_info[i].rxqs[j].rxq_id == rxq_id) {
+ bound_dev->cluster_info[i].rxqs[j].status = 1;
+ oecls_debug("free rxq_id:%d\n", rxq_id);
+ return 0;
+ }
+ }
+ }
+ oecls_debug("no match malloced rxq_id:%d\n", rxq_id);
+ return -1;
+}
+
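+/*
+ * Pick an Rx queue of the given device that is bound to this NUMA node.
+ * With the cluster strategy, prefer a queue whose IRQ affinity is in the
+ * caller's CPU cluster; otherwise take the first free queue from the
+ * per-node bitmap.
+ */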
+int alloc_rxq_id(int nid, int devid)
+{
+ struct oecls_numa_bound_dev_info *bound_dev;
+ struct oecls_numa_info *numa_info;
+ int rxq_id;
+
+ numa_info = get_oecls_numa_info(nid);
+ if (!numa_info) {
+ oecls_error("error nid:%d\n", nid);
+ return -EINVAL;
+ }
+
+ if (devid >= OECLS_MAX_NETDEV_NUM) {
+ oecls_error("error bound_dev index:%d\n", devid);
+ return -EINVAL;
+ }
+ bound_dev = &numa_info->bound_dev[devid];
+
+ if (strategy == 1) {
+ rxq_id = get_cluster_rxq(bound_dev);
+ if (rxq_id < 0 || rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV)
+ pr_info("failed to get rxq_id:%d in cluster, try numa\n", rxq_id);
+ else
+ goto found;
+ }
+
+ rxq_id = find_first_bit(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV);
+ if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) {
+ oecls_error("error rxq_id:%d\n", rxq_id);
+ return -EINVAL;
+ }
+
+found:
+ clear_bit(rxq_id, bound_dev->bitmap_rxq);
+ oecls_debug("alloc nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id);
+ return rxq_id;
+}
+
+void free_rxq_id(int nid, int devid, int rxq_id)
+{
+ struct oecls_numa_bound_dev_info *bound_dev;
+ struct oecls_numa_info *numa_info;
+
+ numa_info = get_oecls_numa_info(nid);
+ if (!numa_info) {
+ oecls_error("error nid:%d\n", nid);
+ return;
+ }
+
+ if (devid >= OECLS_MAX_NETDEV_NUM) {
+ oecls_error("error bound_dev index:%d\n", devid);
+ return;
+ }
+ bound_dev = &numa_info->bound_dev[devid];
+
+ if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) {
+ oecls_error("error rxq_id:%d\n", rxq_id);
+ return;
+ }
+
+ if (strategy == 1)
+ put_cluster_rxq(bound_dev, rxq_id);
+
+ if (test_bit(rxq_id, bound_dev->bitmap_rxq)) {
+ oecls_error("error nid:%d, devid:%d, rxq_id:%d\n", nid, devid, rxq_id);
+ return;
+ }
+
+ set_bit(rxq_id, bound_dev->bitmap_rxq);
+ oecls_debug("free nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id);
+}
+
+static int init_oecls_numa_info(void)
+{
+ struct oecls_numa_info *numa_info;
+ int nid, ret = 0;
+
+ oecls_numa_num = num_online_nodes();
+ oecls_numa_info_table = kcalloc(oecls_numa_num, sizeof(*oecls_numa_info_table),
+ GFP_ATOMIC);
+ if (!oecls_numa_info_table) {
+ ret = -ENOMEM;
+ oecls_error("oecls_numa_info_table alloc failed:%d\n", ret);
+ return ret;
+ }
+
+ oecls_cluster_cpu_num = cpumask_weight(topology_cluster_cpumask(smp_processor_id()));
+ oecls_cluster_per_numa = (nr_cpu_ids / oecls_cluster_cpu_num) / oecls_numa_num;
+	oecls_debug("oecls_numa_num=%d cluster_per_numa:%d cluster_cpu_num:%d\n",
+ oecls_numa_num, oecls_cluster_per_numa, oecls_cluster_cpu_num);
+
+ for_each_oecls_numa(nid, numa_info)
+ init_numa_avail_cpus(nid, numa_info);
+
+ return ret;
+}
+
+static int alloc_available_cpu(int nid, struct oecls_numa_info *numa_info)
+{
+ int cpu;
+
+ cpu = find_first_bit(numa_info->avail_cpus, OECLS_MAX_CPU_NUM);
+ if (cpu >= OECLS_MAX_CPU_NUM) {
+ oecls_error("no available cpus: nid=%d, cpu=%d\n", nid, cpu);
+ return -1;
+ }
+
+ clear_bit(cpu, numa_info->avail_cpus);
+ return cpu;
+}
+
+static void add_netdev_irq_affinity_cpu(struct oecls_netdev_info *oecls_dev, int rxq_id, int cpu)
+{
+ struct oecls_netdev_queue_info *rxq_info;
+
+ if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV)
+ return;
+
+ rxq_info = &oecls_dev->rxq[rxq_id];
+ rxq_info->affinity_cpu = cpu;
+}
+
+static void config_affinity_strategy_default(struct oecls_netdev_info *oecls_dev)
+{
+ struct oecls_numa_info *numa_info;
+ int rxq_num = oecls_dev->rxq_num;
+ int rxq_per_numa = rxq_num / oecls_numa_num;
+ int remain = rxq_num - rxq_per_numa * oecls_numa_num;
+ int numa_rxq_id, rxq_id, nid, cpu;
+
+ oecls_debug("dev=%s, rxq_num=%d, rxq_per_numa=%d, remain=%d\n", oecls_dev->dev_name,
+ rxq_num, rxq_per_numa, remain);
+
+	// distribute rxqs evenly across every NUMA node
+ for_each_oecls_numa(nid, numa_info) {
+ for (numa_rxq_id = 0; numa_rxq_id < rxq_per_numa; numa_rxq_id++) {
+ cpu = alloc_available_cpu(nid, numa_info);
+ if (cpu < 0)
+ break;
+
+ rxq_id = rxq_per_numa * nid + numa_rxq_id;
+ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id, cpu);
+ }
+ }
+
+ if (!remain)
+ return;
+
+	// distribute the remaining rxqs across the NUMA nodes
+ numa_rxq_id = 0;
+ for_each_oecls_numa(nid, numa_info) {
+ if (numa_rxq_id >= remain)
+ break;
+ cpu = alloc_available_cpu(nid, numa_info);
+ if (cpu < 0)
+ break;
+
+ rxq_id = rxq_per_numa * oecls_numa_num + numa_rxq_id;
+ numa_rxq_id++;
+ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id, cpu);
+ }
+}
+
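+/*
+ * Cluster strategy: distribute the Rx queue IRQs round-robin across the
+ * CPU clusters of every NUMA node, starting from the last CPU of each
+ * cluster.
+ */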
+static void config_affinity_strategy_cluster(struct oecls_netdev_info *oecls_dev)
+{
+ int rxq_num = oecls_dev->rxq_num;
+ int rxq_per_numa = rxq_num / oecls_numa_num;
+ int remain = rxq_num - rxq_per_numa * oecls_numa_num;
+ int cpu_idx = oecls_cluster_cpu_num - 1;
+ int cluster, cpu, rxq_id = 0, round;
+
+ round = rxq_per_numa < oecls_cluster_per_numa ? rxq_per_numa : oecls_cluster_per_numa;
+ if (remain > 0)
+ round++;
+ oecls_debug("round=%d\n", round);
+
+ while (rxq_id < oecls_dev->rxq_num) {
+ for (cluster = 0; cluster < oecls_cluster_per_numa * oecls_numa_num; cluster++) {
+ if (cluster % oecls_cluster_per_numa >= round)
+ continue;
+ cpu = cluster * oecls_cluster_cpu_num + cpu_idx;
+ if (rxq_id >= oecls_dev->rxq_num)
+ break;
+ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu);
+ }
+ cpu_idx--;
+ if (--cpu_idx < 0)
+ cpu_idx = oecls_cluster_cpu_num - 1;
+ }
+}
+
+static void config_affinity_strategy_numa(struct oecls_netdev_info *oecls_dev)
+{
+ int rxq_num = oecls_dev->rxq_num;
+ int rxq_per_numa = rxq_num / oecls_numa_num;
+ int cpu_per_numa = nr_cpu_ids / oecls_numa_num;
+ int remain = rxq_num - rxq_per_numa * oecls_numa_num;
+ struct oecls_numa_info *numa_info;
+ int numa_start_cpu, numa_cpu_id;
+ int rxq_id = 0, nid, cpu;
+
+ for_each_oecls_numa(nid, numa_info) {
+ numa_start_cpu = find_first_bit(numa_info->avail_cpus, OECLS_MAX_CPU_NUM);
+ for (numa_cpu_id = 0; numa_cpu_id < rxq_per_numa; numa_cpu_id++) {
+ cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa);
+ if (rxq_id >= oecls_dev->rxq_num)
+ break;
+ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu);
+ }
+ if (remain-- > 0) {
+ cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa);
+ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu);
+ }
+ }
+}
+
+static void config_affinity_strategy_custom(struct oecls_netdev_info *oecls_dev)
+{
+ oecls_debug("dev=%s\n", oecls_dev->dev_name);
+}
+
+static void config_affinity_strategy(void)
+{
+ struct oecls_netdev_info *oecls_dev;
+ int devid;
+
+ for_each_oecls_netdev(devid, oecls_dev) {
+ switch (strategy) {
+ case 1:
+ config_affinity_strategy_cluster(oecls_dev);
+ break;
+ case 2:
+ config_affinity_strategy_numa(oecls_dev);
+ break;
+ case 3:
+ config_affinity_strategy_custom(oecls_dev);
+ break;
+ case 0:
+ default:
+ config_affinity_strategy_default(oecls_dev);
+ break;
+ }
+ }
+}
+
+static inline void irq_set_affinity_wrapper(int rxq, int irq, int cpu)
+{
+ int err = 0;
+
+ err = irq_set_affinity(irq, get_cpu_mask(cpu));
+ oecls_debug("rxq=%d, irq=%d, cpu=%d, err=%d\n", rxq, irq, cpu, err);
+}
+
+static void enable_affinity_strategy(void)
+{
+ struct oecls_netdev_queue_info *rxq_info;
+ struct oecls_netdev_info *oecls_dev;
+ int rxq_id, devid;
+
+ for_each_oecls_netdev(devid, oecls_dev) {
+ for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) {
+ rxq_info = &oecls_dev->rxq[rxq_id];
+ irq_set_affinity_wrapper(rxq_id, rxq_info->irq, rxq_info->affinity_cpu);
+ }
+ }
+}
+
+static inline void netif_set_xps_queue_wrapper(struct net_device *netdev, int rxq_id,
+ const struct cpumask *cpu_mask)
+{
+ int err = 0;
+
+ err = netif_set_xps_queue(netdev, cpu_mask, rxq_id);
+ oecls_debug("name=%s, rxq_id=%d, mask=%*pbl, err=%d\n", netdev->name, rxq_id,
+ cpumask_pr_args(cpu_mask), err);
+}
+
+static void set_netdev_xps_queue(bool enable)
+{
+ const struct cpumask clear_mask = { 0 };
+ struct oecls_netdev_info *oecls_dev;
+ const struct cpumask *cpu_mask;
+ int rxq_id, devid, cpu, nid;
+
+ for_each_oecls_netdev(devid, oecls_dev) {
+ for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) {
+ cpu = oecls_dev->rxq[rxq_id].affinity_cpu;
+ nid = cpu_to_node(cpu);
+ if (enable)
+ cpu_mask = cpumask_of_node(nid);
+ else
+ cpu_mask = &clear_mask;
+
+ netif_set_xps_queue_wrapper(oecls_dev->netdev, rxq_id, cpu_mask);
+ }
+ }
+}
+
+static __init int oecls_init(void)
+{
+ struct oecls_numa_info *numa_info;
+ int nid, err;
+
+ if (!check_params())
+ return -EINVAL;
+
+ err = init_oecls_numa_info();
+ if (err)
+ return err;
+
+ err = init_oecls_netdev_info(ifname);
+ if (err)
+ goto clean_numa;
+
+ // Set irq affinity
+ config_affinity_strategy();
+ enable_affinity_strategy();
+
+ // Calculate rxq bounded to one numa
+ for_each_oecls_numa(nid, numa_info) {
+ err = init_numa_rxq_bitmap(nid, numa_info);
+ if (err)
+ goto clean_rxq;
+ }
+
+#ifdef CONFIG_XPS
+ set_netdev_xps_queue(true);
+#endif
+
+ if (mode == 0)
+ oecls_ntuple_res_init();
+ else
+ oecls_flow_res_init();
+
+ return 0;
+
+clean_rxq:
+clean_numa:
+ clean_oecls_netdev_info();
+ clean_oecls_numa_info();
+ return err;
+}
+
+static __exit void oecls_exit(void)
+{
+ if (mode == 0)
+ oecls_ntuple_res_clean();
+ else
+ oecls_flow_res_clean();
+
+#ifdef CONFIG_XPS
+ set_netdev_xps_queue(false);
+#endif
+
+ clean_oecls_rxq();
+ clean_oecls_netdev_info();
+ clean_oecls_numa_info();
+}
+
+module_init(oecls_init);
+module_exit(oecls_exit);
+
+MODULE_DESCRIPTION("oenetcls");
+MODULE_LICENSE("GPL v2");
diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c
new file mode 100644
index 000000000000..3986d86efe83
--- /dev/null
+++ b/net/oenetcls/oenetcls_ntuple.c
@@ -0,0 +1,565 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/inetdevice.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/inet.h>
+#include <linux/jhash.h>
+#include <net/sock.h>
+#include <trace/hooks/oenetcls.h>
+#include "oenetcls.h"
+
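+/*
+ * oecls_sk_rules hashes installed rules by destination IP/port;
+ * oecls_sk_list maps a socket to its rule hash so the rule can be found
+ * again when the socket is closed.
+ */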
+struct oecls_sk_rule_list oecls_sk_rules, oecls_sk_list;
+
+static void init_oecls_sk_rules(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < OECLS_SK_RULE_HASHSIZE; i++)
+ INIT_HLIST_HEAD(oecls_sk_rules.hash + i);
+ mutex_init(&oecls_sk_rules.mutex);
+}
+
+static inline struct hlist_head *get_rule_hashlist(u32 dip4, u16 dport)
+{
+ return oecls_sk_rules.hash + (jhash_2words(dip4, dport, 0) & OECLS_SK_RULE_HASHMASK);
+}
+
+static inline struct hlist_head *get_sk_hashlist(void *sk)
+{
+ return oecls_sk_list.hash + (jhash(sk, sizeof(sk), 0) & OECLS_SK_RULE_HASHMASK);
+}
+
+static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action,
+ int ruleid, int nid)
+{
+ struct hlist_head *hlist = get_rule_hashlist(dip4, dport);
+ struct hlist_head *sk_hlist = get_sk_hashlist(sk);
+ struct oecls_sk_rule *rule;
+ struct oecls_sk_entry *entry;
+
+ rule = kzalloc(sizeof(*rule), GFP_ATOMIC);
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!rule || !entry)
+ goto out;
+
+ rule->sk = sk;
+ rule->dip4 = dip4;
+ rule->dport = dport;
+ rule->devid = devid;
+ rule->action = action;
+ rule->ruleid = ruleid;
+ rule->nid = nid;
+ hlist_add_head(&rule->node, hlist);
+
+ entry->sk = sk;
+ entry->sk_rule_hash = jhash_2words(dip4, dport, 0);
+ hlist_add_head(&entry->node, sk_hlist);
+ return;
+out:
+ oecls_debug("alloc failed rule:%p entry:%p\n", rule, entry);
+ kfree(entry);
+ kfree(rule);
+}
+
+static struct oecls_sk_entry *get_sk_entry(void *sk)
+{
+ struct hlist_head *sk_hlist = get_sk_hashlist(sk);
+ struct oecls_sk_entry *entry = NULL;
+
+ hlist_for_each_entry(entry, sk_hlist, node) {
+ if (entry->sk == sk)
+ break;
+ }
+ return entry;
+}
+
+static void del_sk_rule(struct oecls_sk_rule *rule)
+{
+ struct oecls_sk_entry *entry;
+
+ entry = get_sk_entry(rule->sk);
+ if (!entry)
+ return;
+ hlist_del_init(&entry->node);
+ kfree(entry);
+
+ oecls_debug("del rule=%p\n", rule);
+ hlist_del_init(&rule->node);
+ kfree(rule);
+}
+
+static struct oecls_sk_rule *get_sk_rule(int devid, u32 dip4, u16 dport)
+{
+ struct hlist_head *hlist = get_rule_hashlist(dip4, dport);
+ struct oecls_sk_rule *rule = NULL;
+
+ hlist_for_each_entry(rule, hlist, node) {
+ if (rule->devid == devid && rule->dip4 == dip4 && rule->dport == dport)
+ break;
+ }
+ return rule;
+}
+
+static struct oecls_sk_rule *get_rule_from_sk(int devid, void *sk)
+{
+ struct oecls_sk_rule *rule = NULL;
+ struct oecls_sk_entry *entry;
+ struct hlist_head *hlist;
+
+ entry = get_sk_entry(sk);
+ if (!entry)
+ return NULL;
+
+ hlist = oecls_sk_rules.hash + (entry->sk_rule_hash & OECLS_SK_RULE_HASHMASK);
+ hlist_for_each_entry(rule, hlist, node) {
+ if (rule->devid == devid && rule->sk == sk)
+ break;
+ }
+ return rule;
+}
+
+static inline bool reuseport_check(int devid, u32 dip4, u16 dport)
+{
+ return !!get_sk_rule(devid, dip4, dport);
+}
+
+static u32 get_first_ip4_addr(struct net *net)
+{
+ struct in_device *in_dev;
+ struct net_device *dev;
+ struct in_ifaddr *ifa;
+ u32 dip4 = 0;
+
+ rtnl_lock();
+ rcu_read_lock();
+ for_each_netdev(net, dev) {
+ if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP))
+ continue;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (!strcmp(dev->name, ifa->ifa_label)) {
+ dip4 = ifa->ifa_local;
+ oecls_debug("dev: %s, dip4:%pI4\n", dev->name, &dip4);
+ goto out;
+ }
+ }
+ }
+out:
+ rcu_read_unlock();
+ rtnl_unlock();
+ return dip4;
+}
+
+static void get_sk_rule_addr(struct sock *sk, u32 *dip4, u16 *dport)
+{
+ *dport = htons(sk->sk_num);
+
+ if (!match_ip_flag) {
+ *dip4 = 0;
+ return;
+ }
+
+ if (sk->sk_rcv_saddr)
+ *dip4 = sk->sk_rcv_saddr;
+ else
+ *dip4 = get_first_ip4_addr(sock_net(sk));
+}
+
+static int rxclass_rule_del(struct cmd_context *ctx, __u32 loc)
+{
+ struct ethtool_rxnfc nfccmd;
+ int err;
+
+ nfccmd.cmd = ETHTOOL_SRXCLSRLDEL;
+ nfccmd.fs.location = loc;
+ err = send_ethtool_ioctl(ctx, &nfccmd);
+ if (err < 0)
+ oecls_debug("rmgr: Cannot delete RX class rule, loc:%u\n", loc);
+ return err;
+}
+
+static int rmgr_ins(struct rmgr_ctrl *rmgr, __u32 loc)
+{
+ if (loc >= rmgr->size) {
+ oecls_error("rmgr: Location out of range\n");
+ return -1;
+ }
+
+ set_bit(loc, rmgr->slot);
+ return 0;
+}
+
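+/*
+ * If the driver picks rule locations itself there is nothing to do.
+ * Otherwise scan the location bitmap from the highest slot downwards and
+ * reserve the first free location for this flow spec; returns -1 when
+ * the table is full.
+ */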
+static int rmgr_find_empty_slot(struct rmgr_ctrl *rmgr, struct ethtool_rx_flow_spec *fsp)
+{
+ __u32 loc, slot_num;
+
+ if (rmgr->driver_select)
+ return 0;
+
+ loc = rmgr->size - 1;
+ slot_num = loc / BITS_PER_LONG;
+ if (!~(rmgr->slot[slot_num] | (~1UL << rmgr->size % BITS_PER_LONG))) {
+ loc -= 1 + (loc % BITS_PER_LONG);
+ slot_num--;
+ }
+
+ while (loc < rmgr->size && !~(rmgr->slot[slot_num])) {
+ loc -= BITS_PER_LONG;
+ slot_num--;
+ }
+
+ while (loc < rmgr->size && test_bit(loc, rmgr->slot))
+ loc--;
+
+ if (loc < rmgr->size) {
+ fsp->location = loc;
+ return rmgr_ins(rmgr, loc);
+ }
+
+ return -1;
+}
+
+static int rxclass_get_dev_info(struct cmd_context *ctx, __u32 *count, int *driver_select)
+{
+ struct ethtool_rxnfc nfccmd;
+ int err;
+
+ nfccmd.cmd = ETHTOOL_GRXCLSRLCNT;
+ nfccmd.data = 0;
+ err = send_ethtool_ioctl(ctx, &nfccmd);
+ *count = nfccmd.rule_cnt;
+ if (driver_select)
+ *driver_select = !!(nfccmd.data & RX_CLS_LOC_SPECIAL);
+ if (err < 0)
+ oecls_debug("rxclass: Cannot get RX class rule count\n");
+
+ return err;
+}
+
+static int rmgr_init(struct cmd_context *ctx, struct rmgr_ctrl *rmgr)
+{
+ struct ethtool_rxnfc *nfccmd;
+ __u32 *rule_locs;
+ int i, err = 0;
+
+ memset(rmgr, 0, sizeof(*rmgr));
+ err = rxclass_get_dev_info(ctx, &rmgr->n_rules, &rmgr->driver_select);
+ if (err < 0)
+ return err;
+
+ if (rmgr->driver_select)
+ return err;
+
+ nfccmd = kzalloc(sizeof(*nfccmd) + (rmgr->n_rules * sizeof(__u32)), GFP_ATOMIC);
+ if (!nfccmd) {
+ oecls_error("rmgr: Cannot allocate memory for RX class rule locations\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ nfccmd->cmd = ETHTOOL_GRXCLSRLALL;
+ nfccmd->rule_cnt = rmgr->n_rules;
+ err = send_ethtool_ioctl(ctx, nfccmd);
+ if (err < 0) {
+ oecls_debug("rmgr: Cannot get RX class rules\n");
+ goto out;
+ }
+
+ rmgr->size = nfccmd->data;
+ if (rmgr->size == 0 || rmgr->size < rmgr->n_rules) {
+ oecls_error("rmgr: Invalid RX class rules table size\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ rmgr->slot = kzalloc(BITS_TO_LONGS(rmgr->size) * sizeof(long), GFP_ATOMIC);
+ if (!rmgr->slot) {
+ oecls_error("rmgr: Cannot allocate memory for RX class rules\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ rule_locs = nfccmd->rule_locs;
+ for (i = 0; i < rmgr->n_rules; i++) {
+ err = rmgr_ins(rmgr, rule_locs[i]);
+ if (err < 0)
+ break;
+ }
+
+out:
+ kfree(nfccmd);
+ return err;
+}
+
+static void rmgr_cleanup(struct rmgr_ctrl *rmgr)
+{
+ kfree(rmgr->slot);
+ rmgr->slot = NULL;
+ rmgr->size = 0;
+}
+
+static int rmgr_set_location(struct cmd_context *ctx,
+ struct ethtool_rx_flow_spec *fsp)
+{
+ struct rmgr_ctrl rmgr;
+ int ret;
+
+ ret = rmgr_init(ctx, &rmgr);
+ if (ret < 0)
+ goto out;
+
+ ret = rmgr_find_empty_slot(&rmgr, fsp);
+out:
+ rmgr_cleanup(&rmgr);
+ return ret;
+}
+
+static int rxclass_rule_ins(struct cmd_context *ctx,
+ struct ethtool_rx_flow_spec *fsp, u32 rss_context)
+{
+ struct ethtool_rxnfc nfccmd;
+ u32 loc = fsp->location;
+ int ret;
+
+ if (loc & RX_CLS_LOC_SPECIAL) {
+ ret = rmgr_set_location(ctx, fsp);
+ if (ret < 0)
+ return ret;
+ }
+
+ nfccmd.cmd = ETHTOOL_SRXCLSRLINS;
+ nfccmd.rss_context = rss_context;
+ nfccmd.fs = *fsp;
+ ret = send_ethtool_ioctl(ctx, &nfccmd);
+ if (ret < 0) {
+		oecls_debug("Cannot insert the classification rule\n");
+ return ret;
+ }
+
+ if (loc & RX_CLS_LOC_SPECIAL)
+ oecls_debug("Added rule with ID %d\n", nfccmd.fs.location);
+
+ return 0;
+}
+
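+/*
+ * Convert an rxnfc flow spec into the legacy RX ntuple format. Ntuple
+ * masks use the opposite polarity to flow_spec masks, so every mask byte
+ * is inverted, and FLOW_EXT has no ntuple equivalent.
+ */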
+static void flow_spec_to_ntuple(struct ethtool_rx_flow_spec *fsp,
+ struct ethtool_rx_ntuple_flow_spec *ntuple)
+{
+ int i;
+
+ memset(ntuple, ~0, sizeof(*ntuple));
+ ntuple->flow_type = fsp->flow_type;
+ ntuple->action = fsp->ring_cookie;
+ memcpy_r(&ntuple->h_u, &fsp->h_u, sizeof(fsp->h_u));
+ memcpy_r(&ntuple->m_u, &fsp->m_u, sizeof(fsp->m_u));
+ for (i = 0; i < sizeof(ntuple->m_u); i++)
+ ntuple->m_u.hdata[i] ^= 0xFF;
+ ntuple->flow_type &= ~FLOW_EXT;
+}
+
+static int do_srxntuple(struct cmd_context *ctx, struct ethtool_rx_flow_spec *fsp)
+{
+ struct ethtool_rx_ntuple ntuplecmd;
+ struct ethtool_value eval;
+ int ret = 0;
+
+ flow_spec_to_ntuple(fsp, &ntuplecmd.fs);
+
+ eval.cmd = ETHTOOL_GFLAGS;
+ ret = send_ethtool_ioctl(ctx, &eval);
+ if (ret || !(eval.data & ETH_FLAG_NTUPLE))
+ return -1;
+
+ ntuplecmd.cmd = ETHTOOL_SRXNTUPLE;
+ ret = send_ethtool_ioctl(ctx, &ntuplecmd);
+ if (ret)
+ oecls_debug("Cannot add new rule via N-tuple, ret:%d\n", ret);
+
+ return ret;
+}
+
+static int cfg_ethtool_rule(struct cmd_context *ctx, bool is_del)
+{
+ struct ethtool_rx_flow_spec *fsp, rx_rule_fs;
+ u32 rss_context = 0;
+ int ret;
+
+ oecls_debug("is_del:%d netdev:%s, dip4:%pI4, dport:%d, action:%d, ruleid:%u, del_ruleid:%u\n",
+ is_del, ctx->netdev, &ctx->dip4, ntohs(ctx->dport), ctx->action, ctx->ruleid,
+ ctx->del_ruleid);
+
+ if (is_del)
+ return rxclass_rule_del(ctx, ctx->del_ruleid);
+
+ ctx->ret_loc = -1;
+
+ fsp = &rx_rule_fs;
+ memset(fsp, 0, sizeof(*fsp));
+ fsp->flow_type = TCP_V4_FLOW;
+ fsp->location = RX_CLS_LOC_ANY;
+ fsp->h_u.tcp_ip4_spec.ip4dst = ctx->dip4;
+ fsp->h_u.tcp_ip4_spec.pdst = ctx->dport;
+ if (ctx->dip4)
+ fsp->m_u.tcp_ip4_spec.ip4dst = (u32)~0ULL;
+ fsp->m_u.tcp_ip4_spec.pdst = (u16)~0ULL;
+ if (ctx->ruleid)
+ fsp->location = ctx->ruleid;
+ fsp->ring_cookie = ctx->action;
+
+ ret = do_srxntuple(ctx, &rx_rule_fs);
+ if (!ret)
+ return 0;
+
+ ret = rxclass_rule_ins(ctx, &rx_rule_fs, rss_context);
+ if (!ret)
+ ctx->ret_loc = rx_rule_fs.location;
+ return ret;
+}
+
+static void del_ntuple_rule(struct sock *sk)
+{
+ struct oecls_netdev_info *oecls_dev;
+ struct cmd_context ctx = { 0 };
+ struct oecls_sk_rule *rule;
+ int devid;
+ u16 dport;
+ u32 dip4;
+ int err;
+
+ get_sk_rule_addr(sk, &dip4, &dport);
+
+ mutex_lock(&oecls_sk_rules.mutex);
+ for_each_oecls_netdev(devid, oecls_dev) {
+ strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ);
+ rule = get_rule_from_sk(devid, sk);
+ if (!rule) {
+ oecls_debug("rule not found! sk:%p, devid:%d, dip4:%pI4, dport:%d\n",
+ sk, devid, &dip4, ntohs(dport));
+ continue;
+ }
+
+ // Config Ntuple rule to dev
+ ctx.del_ruleid = rule->ruleid;
+ err = cfg_ethtool_rule(&ctx, true);
+ if (err) {
+ oecls_error("del sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n",
+ sk, rule->nid, devid, rule->action, rule->ruleid, err);
+ }
+
+ // Free the bound queue
+ free_rxq_id(rule->nid, devid, rule->action);
+
+ // Delete sk rule
+ del_sk_rule(rule);
+ }
+ mutex_unlock(&oecls_sk_rules.mutex);
+}
+
+static void add_ntuple_rule(struct sock *sk)
+{
+ struct oecls_netdev_info *oecls_dev;
+ struct cmd_context ctx = { 0 };
+ int cpu = smp_processor_id();
+ int nid = cpu_to_node(cpu);
+ int rxq_id;
+ int devid;
+ int err;
+
+ if (check_appname(current->comm))
+ return;
+ get_sk_rule_addr(sk, &ctx.dip4, &ctx.dport);
+
+ mutex_lock(&oecls_sk_rules.mutex);
+ for_each_oecls_netdev(devid, oecls_dev) {
+ strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ);
+ if (reuseport_check(devid, ctx.dip4, ctx.dport)) {
+			oecls_error("dip4:%pI4, dport:%d reuse!\n", &ctx.dip4, ntohs(ctx.dport));
+ continue;
+ }
+
+ // Calculate the bound queue
+ rxq_id = alloc_rxq_id(nid, devid);
+ if (rxq_id < 0)
+ continue;
+
+ // Config Ntuple rule to dev
+ ctx.action = (u16)rxq_id;
+ err = cfg_ethtool_rule(&ctx, false);
+ if (err) {
+ oecls_error("add sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n",
+ sk, nid, devid, ctx.action, ctx.ret_loc, err);
+ continue;
+ }
+
+ // Add sk rule
+ add_sk_rule(devid, ctx.dip4, ctx.dport, sk, ctx.action, ctx.ret_loc, nid);
+ }
+ mutex_unlock(&oecls_sk_rules.mutex);
+}
+
+static void ethtool_cfg_rxcls(void *data, struct sock *sk, int is_del)
+{
+ if (sk->sk_state != TCP_LISTEN)
+ return;
+
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+ return;
+
+ oecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, ip:%pI4, port:%d\n", smp_processor_id(),
+ current->comm, sk, is_del, &sk->sk_rcv_saddr, (u16)sk->sk_num);
+
+ if (is_del)
+ del_ntuple_rule(sk);
+ else
+ add_ntuple_rule(sk);
+}
+
+static void clean_oecls_sk_rules(void)
+{
+ struct oecls_netdev_info *oecls_dev;
+ struct cmd_context ctx = { 0 };
+ struct oecls_sk_rule *rule;
+ struct hlist_head *hlist;
+ struct hlist_node *n;
+ unsigned int i;
+ int err;
+
+ mutex_lock(&oecls_sk_rules.mutex);
+ for (i = 0; i < OECLS_SK_RULE_HASHSIZE; i++) {
+ hlist = &oecls_sk_rules.hash[i];
+
+ hlist_for_each_entry_safe(rule, n, hlist, node) {
+ oecls_dev = get_oecls_netdev_info(rule->devid);
+ if (!oecls_dev)
+ continue;
+ strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ);
+ ctx.del_ruleid = rule->ruleid;
+ err = cfg_ethtool_rule(&ctx, true);
+ oecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, err:%d\n", rule->sk,
+ rule->devid, rule->action, rule->ruleid, err);
+
+ hlist_del(&rule->node);
+ oecls_debug("clean rule=%p\n", rule);
+ kfree(rule);
+ }
+ }
+ mutex_unlock(&oecls_sk_rules.mutex);
+}
+
+void oecls_ntuple_res_init(void)
+{
+ init_oecls_sk_rules();
+ register_trace_ethtool_cfg_rxcls(ðtool_cfg_rxcls, NULL);
+}
+
+void oecls_ntuple_res_clean(void)
+{
+ unregister_trace_ethtool_cfg_rxcls(ðtool_cfg_rxcls, NULL);
+ clean_oecls_sk_rules();
+}
--
2.33.0
From: Qizhi Zhang <zhangqizhi3(a)h-partners.com>
Updates of HiSilicon Uncore L3C PMU
---
Support the new version of the L3C PMU, which provides an extended event
space that can be controlled through up to 2 extra address spaces with
separate overflow interrupts. The layout of the control/event registers is
kept the same. The extended events, together with the original ones, cover
monitoring of all transactions on the L3C.
That is to say, the driver supports finer-grained statistics of the L3 cache
with separate, dedicated PMUs, and a new operand `ext` that indicates which
part a perf counting command should be delivered to.
Extended events are specified with the `ext=[1|2]` option so the driver can
distinguish them:
perf stat -e hisi_sccl0_l3c0_0/event=<event_id>,ext=<ext>/
Currently only the event option is used, occupying config bits [7, 0]. There
is still plenty of unused space, so make ext use config bits [16, 17] and
reserve bits [15, 8] for the event option for future extension.
With the extra counters, the number of counters of a HiSilicon uncore PMU
can reach up to 24, and the used-counter bitmap is extended accordingly.
The hw_perf_event::event_base is initialized to the base MMIO address of the
event and is used later for control, overflow handling and count readout.
We still make use of the uncore PMU framework for handling the events and
migrating interrupts on CPU hotplug. The framework's cpuhp callback handles
event and interrupt migration for the original events; if the PMU supports
extended events, their interrupts are migrated to the same CPU chosen by the
framework.
A new HID of HISI0215 is used for this version of the L3C PMU.
Some necessary refactoring is included, allowing the framework to cope with
the new version of the driver.
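As a rough illustration of the config layout described above (event in bits
[7, 0], ext in bits [16, 17]), a minimal decoding sketch could look like the
following; the macro and function names here are hypothetical and are not the
driver's actual helpers:

#include <linux/bitfield.h>
#include <linux/perf_event.h>

/* Illustrative field names; only the bit layout follows the text above. */
#define L3C_EVTYPE_MASK		GENMASK_ULL(7, 0)
#define L3C_EXT_MASK		GENMASK_ULL(17, 16)

static u32 l3c_get_event(struct perf_event *event)
{
	return FIELD_GET(L3C_EVTYPE_MASK, event->attr.config);
}

static u32 l3c_get_ext(struct perf_event *event)
{
	/* 0 = original event space, 1 or 2 = extended spaces */
	return FIELD_GET(L3C_EXT_MASK, event->attr.config);
}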
Yicong Yang (1):
drivers/perf: hisi: Add support for L3C PMU v3
Yushan Wang (2):
Documentation: hisi-pmu: Fix of minor format error
Documentation: hisi-pmu: Add introduction to HiSilicon
Documentation/admin-guide/perf/hisi-pmu.rst | 44 ++-
drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 357 +++++++++----------
drivers/perf/hisilicon/hisi_uncore_pmu.h | 2 +-
3 files changed, 214 insertions(+), 189 deletions(-)
--
2.33.0
From: Yicong Yang <yangyicong(a)hisilicon.com>
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICFKG8
----------------------------------------------------------------------
HIP12 provides hardware metric sampling with the adjacent counters
counter_2n and counter_2n+1. Overflow of counter_2n+1 results in an
interrupt, while overflow of counter_2n loads the initial values, which
are stored in the dedicated registers reload_counter_2n and
reload_counter_2n+1, into both counters.
With the ability above, software can perform sampling only while handling
the interrupt of counter_2n, and can configure different values for
reload_counter_2n and reload_counter_2n+1, which realizes hardware metric
sampling.
For example,
perf record -e '\
{armv8_pmuv3_0/cpu_cycles,period=1000000,hw_metric=1/, \
armv8_pmuv3_0/inst_retired,period=800000,hw_metric=1/}:u' \
-- <workload>
The above command only performs sampling when IPC < 800000 / 1000000,
since the interrupt only fires when cpu_cycles reaches 1000000 while
inst_retired is still below 800000.
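For a user-space consumer, a minimal sketch of opening such a group might
look like the following. This assumes the config2 bit-0 encoding of
hw_metric shown in the diff below, the architectural ARMv8 raw event numbers
0x11 (cpu_cycles) and 0x08 (inst_retired), and that the core PMU accepts raw
events via PERF_TYPE_RAW; error handling is omitted:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_hw_metric_group(void)
{
	struct perf_event_attr cycles = {
		.size		= sizeof(struct perf_event_attr),
		.type		= PERF_TYPE_RAW,
		.config		= 0x11,		/* cpu_cycles */
		.config2	= 1,		/* hw_metric=1 */
		.sample_period	= 1000000,
		.exclude_kernel	= 1,
	};
	struct perf_event_attr insts = cycles;
	int leader, sibling;

	insts.config = 0x08;			/* inst_retired */
	insts.sample_period = 800000;

	/* Group the two events so the PMU can pair their counters. */
	leader = syscall(__NR_perf_event_open, &cycles, 0, -1, -1, 0);
	sibling = syscall(__NR_perf_event_open, &insts, 0, -1, leader, 0);
	return (leader < 0 || sibling < 0) ? -1 : leader;
}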
Signed-off-by: Yicong Yang <yangyicong(a)hisilicon.com>
Signed-off-by: Yushan Wang <wangyushan(a)hisilicon.com>
Signed-off-by: Qizhi Zhang <zhangqizhi3(a)h-partners.com>
---
arch/arm64/configs/openeuler_defconfig | 1 +
drivers/perf/Kconfig | 7 +
drivers/perf/arm_pmu.c | 9 ++
drivers/perf/arm_pmuv3.c | 207 ++++++++++++++++++++++++-
4 files changed, 222 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index f25dd0bd7790..ee5d7760aaf5 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -6912,6 +6912,7 @@ CONFIG_ARM_PMU_ACPI=y
CONFIG_ARM_SMMU_V3_PMU=m
CONFIG_ARM_PMUV3=y
CONFIG_ARM64_BRBE=y
+CONFIG_HISILICON_HW_METRIC=y
# CONFIG_ARM_DSU_PMU is not set
CONFIG_QCOM_L2_PMU=y
CONFIG_QCOM_L3_PMU=y
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 7b7b15f9bb6f..f608c2e66235 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -246,4 +246,11 @@ config CXL_PMU
If unsure say 'm'.
+config HISILICON_HW_METRIC
+ bool "HiSilicon hardware metric sampling support"
+ depends on ARM64
+ help
+ Support hardware metric that allows filter of sampling for specific
+ sampling period ratio.
+
endmenu
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 247b038ff4d9..9a97651b7afb 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -422,6 +422,15 @@ validate_group(struct perf_event *event)
*/
memset(&fake_pmu.used_mask, 0, sizeof(fake_pmu.used_mask));
+
+#ifdef CONFIG_HISILICON_HW_METRIC
+	/*
+	 * Clear percpu_pmu so that the PMU has a chance to know whether
+	 * get_event_idx is being called for validation.
+	 */
+ fake_pmu.percpu_pmu = NULL;
+#endif
+
if (!validate_event(event->pmu, &fake_pmu, leader))
return -EINVAL;
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index c51206684863..29a659d5a273 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -320,6 +320,12 @@ static const struct attribute_group armv8_pmuv3_events_attr_group = {
#define ATTR_CFG_FLD_threshold_LO 5
#define ATTR_CFG_FLD_threshold_HI 16
+#ifdef CONFIG_HISILICON_HW_METRIC
+#define ATTR_CFG_FLD_hw_metric_CFG config2
+#define ATTR_CFG_FLD_hw_metric_LO 0
+#define ATTR_CFG_FLD_hw_metric_HI 0
+#endif
+
GEN_PMU_FORMAT_ATTR(event);
GEN_PMU_FORMAT_ATTR(long);
GEN_PMU_FORMAT_ATTR(rdpmc);
@@ -327,6 +333,10 @@ GEN_PMU_FORMAT_ATTR(threshold_count);
GEN_PMU_FORMAT_ATTR(threshold_compare);
GEN_PMU_FORMAT_ATTR(threshold);
+#ifdef CONFIG_HISILICON_HW_METRIC
+GEN_PMU_FORMAT_ATTR(hw_metric);
+#endif
+
static int sysctl_perf_user_access __read_mostly;
static bool armv8pmu_event_is_64bit(struct perf_event *event)
@@ -358,6 +368,29 @@ static u8 armv8pmu_event_threshold_control(struct perf_event_attr *attr)
return (th_compare << 1) | th_count;
}
+#ifdef CONFIG_HISILICON_HW_METRIC
+static inline bool armv8pmu_event_is_hw_metric(struct perf_event *event)
+{
+ return ATTR_CFG_GET_FLD(&event->attr, hw_metric);
+}
+
+static bool armpmu_support_hisi_hw_metric(void)
+{
+ static const struct midr_range hip12_cpus[] = {
+ MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
+ { }
+ };
+
+	/*
+	 * The hw metric feature requires access to EL1 registers, which
+	 * would cause a kernel panic in a virtual machine due to lack of
+	 * privilege. Thus, this feature is disabled for virtual machines.
+	 */
+ return is_midr_in_range_list(read_cpuid_id(), hip12_cpus) &&
+ is_kernel_in_hyp_mode();
+}
+#endif
+
static struct attribute *armv8_pmuv3_format_attrs[] = {
&format_attr_event.attr,
&format_attr_long.attr,
@@ -365,11 +398,29 @@ static struct attribute *armv8_pmuv3_format_attrs[] = {
&format_attr_threshold.attr,
&format_attr_threshold_compare.attr,
&format_attr_threshold_count.attr,
+#ifdef CONFIG_HISILICON_HW_METRIC
+ &format_attr_hw_metric.attr,
+#endif
NULL,
};
+#ifdef CONFIG_HISILICON_HW_METRIC
+static umode_t
+armv8pmu_format_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ if (attr == &format_attr_hw_metric.attr && !armpmu_support_hisi_hw_metric())
+ return 0;
+
+ return attr->mode;
+}
+#endif
+
static const struct attribute_group armv8_pmuv3_format_attr_group = {
.name = "format",
+#ifdef CONFIG_HISILICON_HW_METRIC
+ .is_visible = armv8pmu_format_attr_is_visible,
+#endif
.attrs = armv8_pmuv3_format_attrs,
};
@@ -603,6 +654,41 @@ static void armv8pmu_write_evcntr(int idx, u64 value)
write_pmevcntrn(counter, value);
}
+#ifdef CONFIG_HISILICON_HW_METRIC
+static inline void armv8pmu_write_reload_counter(struct perf_event *event,
+ u64 value)
+{
+ /* Need to be event->hw.idx - 1 since counter 0 is PMCCNTR_EL0 */
+ int idx = event->hw.idx - 1;
+
+#define HW_METRIC_RELOAD_CNTR(n) sys_reg(3, 3, 15, 3, (2 + n))
+#define write_hw_metric_reload_cntr(_value, _n) \
+ do { \
+ switch (_n) { \
+ case 0: \
+ write_sysreg_s(_value, HW_METRIC_RELOAD_CNTR(0)); break; \
+ case 1: \
+ write_sysreg_s(_value, HW_METRIC_RELOAD_CNTR(1)); break; \
+ case 2: \
+ write_sysreg_s(_value, HW_METRIC_RELOAD_CNTR(2)); break; \
+ case 3: \
+ write_sysreg_s(_value, HW_METRIC_RELOAD_CNTR(3)); break; \
+ case 4: \
+ write_sysreg_s(_value, HW_METRIC_RELOAD_CNTR(4)); break; \
+ case 5: \
+ write_sysreg_s(_value, HW_METRIC_RELOAD_CNTR(5)); break; \
+ default: \
+ WARN(1, "Invalid hw_metric reload counter index\n"); \
+ dev_err(event->pmu->dev, "event is 0x%lx index is %x\n",\
+ event->hw.config_base, event->hw.idx); \
+ } \
+ } while (0)
+ write_hw_metric_reload_cntr(value, idx);
+#undef write_hw_metric_reload_cntr
+#undef HW_METRIC_RELOAD_CNTR
+}
+#endif
+
static void armv8pmu_write_hw_counter(struct perf_event *event,
u64 value)
{
@@ -614,6 +700,11 @@ static void armv8pmu_write_hw_counter(struct perf_event *event,
} else {
armv8pmu_write_evcntr(idx, value);
}
+
+#ifdef CONFIG_HISILICON_HW_METRIC
+ if (armv8pmu_event_is_hw_metric(event))
+ armv8pmu_write_reload_counter(event, value);
+#endif
}
static void armv8pmu_write_counter(struct perf_event *event, u64 value)
@@ -688,6 +779,38 @@ static void armv8pmu_enable_counter(u32 mask)
write_pmcntenset(mask);
}
+#ifdef CONFIG_HISILICON_HW_METRIC
+static inline void armv8pmu_enable_hw_metric(struct perf_event *event, bool enable)
+{
+ int idx = event->hw.idx;
+ u64 reg;
+
+ /*
+ * Configure the chicken bit on leader event enabling.
+ */
+ if (event != event->group_leader)
+ return;
+
+ /* Convert the idx since we only use general counters, counter 0 is
+ * used for PMCCNTR_EL0.
+ */
+ idx -= 1;
+
+#define HISI_DTU_CTLR_EL1 sys_reg(3, 0, 15, 8, 4)
+#define HISI_DTU_CTLR_EL1_CHK_GROUP0 BIT(15)
+
+ reg = read_sysreg_s(HISI_DTU_CTLR_EL1);
+ if (enable)
+ reg |= HISI_DTU_CTLR_EL1_CHK_GROUP0 << (idx >> 1);
+ else
+ reg &= ~(HISI_DTU_CTLR_EL1_CHK_GROUP0 << (idx >> 1));
+
+ write_sysreg_s(reg, HISI_DTU_CTLR_EL1);
+
+ reg = read_sysreg_s(HISI_DTU_CTLR_EL1);
+}
+#endif
+
static void armv8pmu_enable_event_counter(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
@@ -696,8 +819,14 @@ static void armv8pmu_enable_event_counter(struct perf_event *event)
kvm_set_pmu_events(mask, attr);
/* We rely on the hypervisor switch code to enable guest counters */
- if (!kvm_pmu_counter_deferred(attr))
+ if (!kvm_pmu_counter_deferred(attr)) {
armv8pmu_enable_counter(mask);
+
+#ifdef CONFIG_HISILICON_HW_METRIC
+ if (armv8pmu_event_is_hw_metric(event))
+ armv8pmu_enable_hw_metric(event, true);
+#endif
+ }
}
static void armv8pmu_disable_counter(u32 mask)
@@ -718,8 +847,14 @@ static void armv8pmu_disable_event_counter(struct perf_event *event)
kvm_clr_pmu_events(mask);
/* We rely on the hypervisor switch code to disable guest counters */
- if (!kvm_pmu_counter_deferred(attr))
+ if (!kvm_pmu_counter_deferred(attr)) {
armv8pmu_disable_counter(mask);
+
+#ifdef CONFIG_HISILICON_HW_METRIC
+ if (armv8pmu_event_is_hw_metric(event))
+ armv8pmu_enable_hw_metric(event, false);
+#endif
+ }
}
static void armv8pmu_enable_intens(u32 mask)
@@ -1005,6 +1140,61 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc,
return -EAGAIN;
}
+#ifdef CONFIG_HISILICON_HW_METRIC
+static int armv8pmu_check_hw_metric_event(struct pmu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+ int hw_metric_cnt = 0;
+
+ if (cpuc->percpu_pmu) {
+ for_each_sibling_event(sibling, leader) {
+ if (armv8pmu_event_is_hw_metric(sibling))
+ hw_metric_cnt++;
+ }
+
+ if (hw_metric_cnt != 1)
+ return -EINVAL;
+ } else {
+ if (event == leader)
+ return 0;
+
+ if (!armv8pmu_event_is_hw_metric(leader))
+ return -EINVAL;
+
+ for_each_sibling_event(sibling, leader) {
+ if (armv8pmu_event_is_hw_metric(sibling))
+ hw_metric_cnt++;
+ }
+
+ if (hw_metric_cnt > 0)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int armv8pmu_get_hw_metric_event_idx(struct pmu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct perf_event *leader = event->group_leader;
+ int leader_idx;
+
+ if (armv8pmu_check_hw_metric_event(cpuc, event))
+ return -EINVAL;
+
+ if (event == leader || leader->hw.idx < 1)
+ return armv8pmu_get_chain_idx(cpuc, cpu_pmu);
+
+ leader_idx = leader->hw.idx;
+ if (cpuc->events[leader_idx - 1])
+ return -EAGAIN;
+
+ return leader_idx - 1;
+}
+#endif
+
static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
struct perf_event *event)
{
@@ -1012,6 +1202,14 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
struct hw_perf_event *hwc = &event->hw;
unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT;
+#ifdef CONFIG_HISILICON_HW_METRIC
+ if (armv8pmu_event_is_hw_metric(event))
+ return armv8pmu_get_hw_metric_event_idx(cpuc, event);
+ else if (event != event->group_leader &&
+ armv8pmu_event_is_hw_metric(event->group_leader))
+ return -EINVAL;
+#endif
+
/* Always prefer to place a cycle counter into the cycle counter. */
if ((evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) &&
!armv8pmu_event_get_threshold(&event->attr)) {
@@ -1235,6 +1433,11 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
if (armv8pmu_event_is_64bit(event))
event->hw.flags |= ARMPMU_EVT_64BIT;
+#ifdef CONFIG_HISILICON_HW_METRIC
+ if (armv8pmu_event_is_hw_metric(event) && !armpmu_support_hisi_hw_metric())
+ return -EOPNOTSUPP;
+#endif
+
/*
* User events must be allocated into a single counter, and so
* must not be chained.
--
2.33.0
[PATCH OLK-6.6 0/1] kvm: hisi_virt: Fix the socket_id of broadcast bitmap for another socket
by Jinqian Yang 26 Aug '25
Zhou Wang (1):
kvm: hisi_virt: Fix the socket_id of broadcast bitmap for another
socket
arch/arm64/kvm/hisilicon/hisi_virt.c | 12 +++++++++---
arch/arm64/kvm/hisilicon/hisi_virt.h | 5 +++++
2 files changed, 14 insertions(+), 3 deletions(-)
--
2.33.0
Pu Lehui (10):
bpf: Add CONFIG_HISOCK
bpf: Add XDP_HISOCK_REDIRECT action
bpf: Add BPF_PROG_TYPE_HISOCK prog type
bpf: Add HISOCK_EGRESS hook on network egress path
bpf: Add bpf_get_ingress_dst helper
bpf: Add hisock_xdp_buff wrapper for xdp_buff
bpf: Add bpf_set_ingress_dst helper
bpf: Add bpf_change_skb_dev helper
openeuler_defconfig: Enable CONFIG_HISOCK
samples/bpf: Add HiSock Redirect sample
Xu Kuohai (1):
bpf: Add bpf_ext_memcpy extension helper for arm64
arch/arm64/configs/openeuler_defconfig | 1 +
arch/arm64/include/asm/insn.h | 4 +
arch/arm64/lib/insn.c | 8 +
arch/arm64/net/bpf_jit.h | 15 +
arch/arm64/net/bpf_jit_comp.c | 266 ++++++++++++++++
arch/x86/configs/openeuler_defconfig | 1 +
include/linux/bpf-cgroup-defs.h | 3 +
include/linux/bpf-cgroup.h | 25 ++
include/linux/bpf_types.h | 4 +
include/linux/filter.h | 3 +
include/net/xdp.h | 5 +
include/uapi/linux/bpf.h | 45 +++
kernel/bpf/cgroup.c | 43 +++
kernel/bpf/core.c | 7 +
kernel/bpf/helpers.c | 27 ++
kernel/bpf/syscall.c | 19 ++
kernel/bpf/verifier.c | 27 ++
net/Kconfig | 10 +
net/core/dev.c | 76 ++++-
net/core/filter.c | 182 +++++++++++
net/ipv4/ip_output.c | 68 +++++
samples/bpf/.gitignore | 1 +
samples/bpf/Makefile | 3 +
samples/bpf/hisock/bpf.c | 247 +++++++++++++++
samples/bpf/hisock/hisock_cmd.c | 405 +++++++++++++++++++++++++
tools/include/uapi/linux/bpf.h | 45 +++
tools/lib/bpf/libbpf.c | 3 +
27 files changed, 1540 insertions(+), 3 deletions(-)
create mode 100644 samples/bpf/hisock/bpf.c
create mode 100644 samples/bpf/hisock/hisock_cmd.c
--
2.34.1
Jinjiang Tu (2):
mm/vmscan: don't try to reclaim hwpoison folio
mm/vmscan: fix hwpoisoned large folio handling in shrink_folio_list
mm/vmscan.c | 10 ++++++++++
1 file changed, 10 insertions(+)
--
2.43.0