[PATCH OLK-5.10] net/oenetcls: introduce oenetcls for network optimization

hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICBFCS CVE: NA -------------------------------- This introduces a kind of network optimization method named oenetcls. It can configure the ntuple rule, and bind interrupt to the netdev queue automatically. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> Signed-off-by: Wang Liang <wangliang74@huawei.com> Signed-off-by: Liu Jian <liujian56@huawei.com> Signed-off-by: yuelg <yuelg@chinaunicom.cn> --- arch/arm64/configs/openeuler_defconfig | 2 + arch/x86/configs/openeuler_defconfig | 1 + drivers/hooks/Kconfig | 10 + drivers/hooks/vendor_hooks.c | 8 + include/linux/netdevice.h | 2 +- include/trace/hooks/oenetcls.h | 44 ++ net/Kconfig | 1 + net/Makefile | 1 + net/core/dev.c | 21 + net/ipv4/af_inet.c | 5 + net/ipv4/tcp.c | 10 + net/oenetcls/Kconfig | 10 + net/oenetcls/Makefile | 8 + net/oenetcls/asmdefs.h | 61 ++ net/oenetcls/memcpy-sve.S | 157 ++++ net/oenetcls/oenetcls.h | 176 +++++ net/oenetcls/oenetcls_flow.c | 402 ++++++++++ net/oenetcls/oenetcls_l0.c | 193 +++++ net/oenetcls/oenetcls_main.c | 972 +++++++++++++++++++++++++ net/oenetcls/oenetcls_ntuple.c | 510 +++++++++++++ 20 files changed, 2593 insertions(+), 1 deletion(-) create mode 100644 include/trace/hooks/oenetcls.h create mode 100644 net/oenetcls/Kconfig create mode 100644 net/oenetcls/Makefile create mode 100644 net/oenetcls/asmdefs.h create mode 100644 net/oenetcls/memcpy-sve.S create mode 100644 net/oenetcls/oenetcls.h create mode 100644 net/oenetcls/oenetcls_flow.c create mode 100644 net/oenetcls/oenetcls_l0.c create mode 100644 net/oenetcls/oenetcls_main.c create mode 100644 net/oenetcls/oenetcls_ntuple.c diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 4a11c9f061cc..c7c652a295e1 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6260,6 +6260,8 @@ CONFIG_USB4=m # CONFIG_VENDOR_HOOKS=y CONFIG_VENDOR_BOND_HOOKS=y +CONFIG_OENETCLS_HOOKS=y +CONFIG_OENETCLS=m # end of Vendor Hooks CONFIG_LIBNVDIMM=m diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 4fed721fb07a..c9e9e1a76881 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -7323,6 +7323,7 @@ CONFIG_USB4=m # CONFIG_VENDOR_HOOKS=y CONFIG_VENDOR_BOND_HOOKS=y +# CONFIG_OENETCLS_HOOKS is not set # end of Vendor Hooks CONFIG_LIBNVDIMM=m diff --git a/drivers/hooks/Kconfig b/drivers/hooks/Kconfig index 6a00168e67ad..90b0f6ea4040 100644 --- a/drivers/hooks/Kconfig +++ b/drivers/hooks/Kconfig @@ -20,4 +20,14 @@ config VENDOR_BOND_HOOKS Allow vendor modules to attach bonding driver hooks defined via DECLARE_HOOK or DECLARE_RESTRICTED_HOOK. +config OENETCLS_HOOKS + bool "Oenetcls driver Hooks" + depends on VENDOR_HOOKS + default n + help + Enable oenetcls vendor hooks + Allow vendor modules to attach oenetcls hooks defined via + DECLARE_HOOK or DECLARE_RESTRICTED_HOOK. + Use OENETCLS && OENETCLS_HOOKS to enable oenetcls feature. 
+ endmenu diff --git a/drivers/hooks/vendor_hooks.c b/drivers/hooks/vendor_hooks.c index 85bda58159f6..d9b85b57a742 100644 --- a/drivers/hooks/vendor_hooks.c +++ b/drivers/hooks/vendor_hooks.c @@ -9,6 +9,7 @@ #define CREATE_TRACE_POINTS #include <trace/hooks/vendor_hooks.h> #include <trace/hooks/bonding.h> +#include <trace/hooks/oenetcls.h> /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -18,3 +19,10 @@ #ifdef CONFIG_VENDOR_BOND_HOOKS EXPORT_TRACEPOINT_SYMBOL_GPL(vendor_bond_check_dev_link); #endif + +#ifdef CONFIG_OENETCLS_HOOKS +EXPORT_TRACEPOINT_SYMBOL_GPL(oecls_flow_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(oecls_set_cpu); +EXPORT_TRACEPOINT_SYMBOL_GPL(oecls_timeout); +EXPORT_TRACEPOINT_SYMBOL_GPL(ethtool_cfg_rxcls); +#endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df61a63a5550..4b7b6fa1dcb1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -808,7 +808,7 @@ struct netdev_rx_queue { struct xsk_buff_pool *pool; #endif - KABI_RESERVE(1) + KABI_USE(1, void *__rcu oecls_ftb) KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/trace/hooks/oenetcls.h b/include/trace/hooks/oenetcls.h new file mode 100644 index 000000000000..c38545d7a6a2 --- /dev/null +++ b/include/trace/hooks/oenetcls.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * oenetcls driver Hooks + * + * Copyright (c) 2025, Huawei Tech. Co., Ltd. + */ + +#ifdef CONFIG_OENETCLS_HOOKS + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM oenetcls + +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_OENETCLS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OENETCLS_H +#include <linux/tracepoint.h> +#include <trace/hooks/vendor_hooks.h> + +struct sock; +struct sk_buff; +struct net_device; + +DECLARE_HOOK(oecls_flow_update, +TP_PROTO(struct sock *sk), +TP_ARGS(sk)); + +DECLARE_HOOK(oecls_set_cpu, +TP_PROTO(struct sk_buff *skb), +TP_ARGS(skb)); + +DECLARE_HOOK(oecls_timeout, +TP_PROTO(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id, bool *ret), +TP_ARGS(dev, rxq_index, flow_id, filter_id, ret)); + +DECLARE_HOOK(ethtool_cfg_rxcls, +TP_PROTO(struct sock *sk, int is_del), +TP_ARGS(sk, is_del)); + +#endif +/* This part must be outside protection */ +#include <trace/define_trace.h> + +#endif + diff --git a/net/Kconfig b/net/Kconfig index 51a934426f9f..071f8ee3b89f 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -67,6 +67,7 @@ source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" source "net/xdp/Kconfig" +source "net/oenetcls/Kconfig" config INET bool "TCP/IP networking" diff --git a/net/Makefile b/net/Makefile index 45c03aa92ace..dc637db866e3 100644 --- a/net/Makefile +++ b/net/Makefile @@ -87,3 +87,4 @@ obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ +obj-$(CONFIG_OENETCLS) += oenetcls/ diff --git a/net/core/dev.c b/net/core/dev.c index c49f3d306b5c..eda1975ef55b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -149,6 +149,7 @@ #include <net/net_rship.h> #include "net-sysfs.h" +#include <trace/hooks/oenetcls.h> #define MAX_GRO_SKBS 8 @@ -4484,6 +4485,11 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, bool expire = true; unsigned int cpu; +#if IS_ENABLED(CONFIG_OENETCLS_HOOKS) + trace_oecls_timeout(dev, rxq_index, flow_id, filter_id, &expire); + if (expire) + return true; +#endif rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= 
flow_table->mask) { @@ -5819,6 +5825,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb) } } #endif + +#if IS_ENABLED(CONFIG_OENETCLS_HOOKS) + trace_oecls_set_cpu(skb); +#endif + ret = __netif_receive_skb(skb); rcu_read_unlock(); return ret; @@ -5853,6 +5864,12 @@ static void netif_receive_skb_list_internal(struct list_head *head) } } #endif + +#if IS_ENABLED(CONFIG_OENETCLS_HOOKS) + list_for_each_entry_safe(skb, next, head, list) + trace_oecls_set_cpu(skb); +#endif + __netif_receive_skb_list(head); rcu_read_unlock(); } @@ -9984,6 +10001,10 @@ int __netdev_update_features(struct net_device *dev) return err < 0 ? 0 : 1; } +#if IS_ENABLED(CONFIG_OENETCLS) +EXPORT_SYMBOL(__netdev_update_features); +#endif + /** * netdev_update_features - recalculate device features * @dev: the device to check diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c6ec22f314c4..b09b1d6575a4 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,6 +119,7 @@ #include <net/compat.h> #include <trace/events/sock.h> +#include <trace/hooks/oenetcls.h> /* The inetsw table contains everything that inet_create needs to * build a new socket. @@ -232,6 +233,10 @@ int inet_listen(struct socket *sock, int backlog) if (err) goto out; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); + +#if IS_ENABLED(CONFIG_OENETCLS_HOOKS) + trace_ethtool_cfg_rxcls(sk, 0); +#endif } err = 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2a5147a4d1af..f014fad55bbf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -280,6 +280,7 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> #include <net/net_rship.h> +#include <trace/hooks/oenetcls.h> DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); @@ -2176,6 +2177,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); +#if IS_ENABLED(CONFIG_OENETCLS_HOOKS) + trace_oecls_flow_update(sk); +#endif + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) sk_busy_loop(sk, nonblock); @@ -2750,6 +2755,11 @@ void __tcp_close(struct sock *sk, long timeout) void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); + +#if IS_ENABLED(CONFIG_OENETCLS_HOOKS) + trace_ethtool_cfg_rxcls(sk, 1); +#endif + __tcp_close(sk, timeout); release_sock(sk); if (!sk->sk_net_refcnt) diff --git a/net/oenetcls/Kconfig b/net/oenetcls/Kconfig new file mode 100644 index 000000000000..128b798d6b7f --- /dev/null +++ b/net/oenetcls/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config OENETCLS + tristate "Network classification" + depends on HISI_L0 && OENETCLS_HOOKS + default n + help + Allows to configure ntuple rule, and bind interrupt to netdev + automatically. + Use OENETCLS && OENETCLS_HOOKS to enable oenetcls feature. + Use parameter mode to decide running mode. 
diff --git a/net/oenetcls/Makefile b/net/oenetcls/Makefile new file mode 100644 index 000000000000..4d395d36a5fc --- /dev/null +++ b/net/oenetcls/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_OENETCLS) = oenetcls.o +oenetcls-y := oenetcls_main.o oenetcls_ntuple.o oenetcls_flow.o oenetcls_l0.o +ifeq ($(CONFIG_ARM64_SVE),y) +oenetcls-y += memcpy-sve.o +endif + diff --git a/net/oenetcls/asmdefs.h b/net/oenetcls/asmdefs.h new file mode 100644 index 000000000000..8138a94c18af --- /dev/null +++ b/net/oenetcls/asmdefs.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text + +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. */ +GNU_PROPERTY(FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name, %function; \ + .align alignment; \ +name: \ + .cfi_startproc; \ + BTI_C; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name, %function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#endif diff --git a/net/oenetcls/memcpy-sve.S b/net/oenetcls/memcpy-sve.S new file mode 100644 index 000000000000..106e4c30294c --- /dev/null +++ b/net/oenetcls/memcpy-sve.S @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#include "asmdefs.h" + +.arch armv8-a+sve + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x6 +#define vlen x6 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + SVE vectors are used to speedup small copies. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_sve) +ENTRY (__memcpy_aarch64_sve) + cmp count, 128 + b.hi L(copy_long) + cntb vlen + cmp count, vlen, lsl 1 + b.hi L(copy32_128) + + whilelo p0.b, xzr, count + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] + ret + + /* Medium copies: 33..128 bytes. 
*/ +L(copy32_128): + add srcend, src, count + add dstend, dstin, count + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + add srcend, src, count + add dstend, dstin, count + + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(return) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. 
*/ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(return): + ret + +END (__memcpy_aarch64_sve) diff --git a/net/oenetcls/oenetcls.h b/net/oenetcls/oenetcls.h new file mode 100644 index 000000000000..72aeedcaf0a0 --- /dev/null +++ b/net/oenetcls/oenetcls.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _NET_OENETCLS_H +#define _NET_OENETCLS_H +#include <linux/if.h> +#include <linux/mutex.h> +#include <linux/cpufeature.h> + +#define OECLS_MAX_NETDEV_NUM 8 +#define OECLS_MAX_RXQ_NUM_PER_DEV 256 +#define OECLS_MAX_NUMA_NUM 16 +#define OECLS_MAX_CPU_NUM 1024 + +#define OECLS_TIMEOUT (5 * HZ) +#define OECLS_NO_FILTER 0xffff +#define OECLS_NO_CPU 0xffff + +struct oecls_netdev_queue_info { + int irq; + int affinity_cpu; +}; + +struct oecls_netdev_info { + char dev_name[IFNAMSIZ]; + struct net_device *netdev; + int rxq_num; + struct oecls_netdev_queue_info rxq[OECLS_MAX_RXQ_NUM_PER_DEV]; + int old_filter_state; +}; + +struct oecls_numa_bound_dev_info { + DECLARE_BITMAP(bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); +}; + +struct oecls_numa_info { + DECLARE_BITMAP(avail_cpus, OECLS_MAX_CPU_NUM); + struct oecls_numa_bound_dev_info bound_dev[OECLS_MAX_NETDEV_NUM]; +}; + +struct cmd_context { + char netdev[IFNAMSIZ]; + u32 dip4; + u16 dport; + u16 action; + u32 ruleid; + u32 del_ruleid; + int ret_loc; +}; + +#define OECLS_SK_RULE_HASHSIZE 256 +#define OECLS_SK_RULE_HASHMASK (OECLS_SK_RULE_HASHSIZE - 1) + +struct oecls_sk_rule_list { + struct hlist_head hash[OECLS_SK_RULE_HASHSIZE]; + /* Mutex to synchronize access to ntuple rule locking */ + struct mutex mutex; +}; + +struct oecls_sk_rule { + struct hlist_node node; + int devid; + void *sk; + int dip4; + int dport; + int action; + int ruleid; + int nid; +}; + +struct oecls_dev_flow { + unsigned short cpu; + unsigned short filter; + unsigned int last_qtail; + int isvalid; + unsigned long timeout; +}; + +struct oecls_dev_flow_table { + unsigned int mask; + struct rcu_head rcu; + struct oecls_dev_flow flows[]; +}; + +struct oecls_sock_flow_table { + u32 mask; + u32 ents[] ____cacheline_aligned_in_smp; +}; + +#define OECLS_DEV_FLOW_TABLE_NUM 0x1000 +#define OECLS_SOCK_FLOW_TABLE_NUM 0x100000 +#define OECLS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct oecls_dev_flow_table) + \ + ((_num) * sizeof(struct oecls_dev_flow))) +#define OECLS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct oecls_sock_flow_table, ents[_num])) + +#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ + ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ + NETIF_F_RXHASH) + +struct rmgr_ctrl { + int driver_select; + unsigned long *slot; + __u32 n_rules; + __u32 size; +}; + +extern int match_ip_flag; +extern int debug; +extern int oecls_netdev_num; +extern int oecls_numa_num; + +#define oecls_debug(fmt, ...) \ + do { \ + if (debug) \ + trace_printk(fmt, ## __VA_ARGS__); \ + } while (0) + +#define oecls_error(fmt, ...) 
\ + do { \ + pr_err("oenetcls [%s:%d]: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); \ + trace_printk(fmt, ## __VA_ARGS__); \ + } while (0) + +struct oecls_netdev_info *get_oecls_netdev_info(unsigned int index); + +#define for_each_oecls_netdev(devid, oecls_dev) \ + for (devid = 0, oecls_dev = get_oecls_netdev_info(devid); \ + (devid < oecls_netdev_num) && oecls_dev; \ + devid++, oecls_dev = get_oecls_netdev_info(devid)) + +struct oecls_numa_info *get_oecls_numa_info(unsigned int nid); + +#define for_each_oecls_numa(nid, numa_info) \ + for (nid = 0, numa_info = get_oecls_numa_info(nid); \ + (nid < oecls_numa_num) && numa_info; \ + nid++, numa_info = get_oecls_numa_info(nid)) + +#ifdef CONFIG_ARM64_SVE +void *__memcpy_aarch64_sve(void *, const void *, size_t); +#define memcpy_r(dst, src, len) \ + do { \ + if (system_supports_sve()) \ + __memcpy_aarch64_sve(dst, src, len); \ + else \ + memcpy(dst, src, len); \ + } while (0) +#else +#define memcpy_r(dst, src, len) memcpy(dst, src, len) +#endif + +int check_appname(char *task_name); +int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd); +int alloc_rxq_id(int nid, int devid); +void free_rxq_id(int nid, int devid, int rxq_id); +void oecls_ntuple_res_init(void); +void oecls_ntuple_res_clean(void); +void oecls_flow_res_init(void); +void oecls_flow_res_clean(void); + +#define L0_MAX_PAGE_SIZE (8192) +#define L0_MAX_PAGE_NUM (4096) + +struct l0_vma_data { + struct page *page; + unsigned long size; + int nid; +}; + +void clean_oecls_l0_cache(void); +void init_oecls_l0_cache(void); +void *alloc_from_l0(int size); +void free_to_l0(void *addr); +int l3t_shared_lock(int nid, unsigned long pfn, unsigned long size); +int l3t_shared_unlock(int nid, unsigned long pfn, unsigned long size); + +#endif /* _NET_OENETCLS_H */ diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c new file mode 100644 index 000000000000..c35ce5aca416 --- /dev/null +++ b/net/oenetcls/oenetcls_flow.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/inetdevice.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/inet.h> +#include <net/sock.h> +#include <trace/hooks/oenetcls.h> +#include "oenetcls.h" + +static u32 oecls_cpu_mask; +static struct oecls_sock_flow_table __rcu *oecls_sock_flow_table; +static DEFINE_MUTEX(oecls_sock_flow_mutex); +static DEFINE_SPINLOCK(oecls_dev_flow_lock); + +bool is_oecls_config_netdev(const char *name) +{ + struct oecls_netdev_info *netdev_info; + int netdev_loop; + + for_each_oecls_netdev(netdev_loop, netdev_info) + if (strcmp(netdev_info->dev_name, name) == 0) + return true; + + return false; +} + +static void oecls_timeout(void *data, struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id, bool *ret) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct oecls_dev_flow_table *flow_table; + struct oecls_dev_flow *rflow; + bool expire = true; + unsigned int cpu; + + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->oecls_ftb); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = READ_ONCE(rflow->cpu); + oecls_debug("dev:%s, rxq:%d, flow_id:%u, filter_id:%d/%d, cpu:%d", dev->name, + rxq_index, flow_id, filter_id, rflow->filter, cpu); + + if (rflow->filter == filter_id && cpu < nr_cpu_ids) { + if (time_before(jiffies, rflow->timeout + OECLS_TIMEOUT)) { + expire = false; + } else { + rflow->isvalid = 0; + WRITE_ONCE(rflow->cpu, OECLS_NO_CPU); + } + } + 
} + rcu_read_unlock(); + oecls_debug("%s, dev:%s, rxq:%d, flow_id:%u, filter_id:%d, expire:%d\n", __func__, + dev->name, rxq_index, flow_id, filter_id, expire); + *ret = expire; +} + +static void oecls_flow_update(void *data, struct sock *sk) +{ + struct oecls_sock_flow_table *tb; + unsigned int hash, index; + u32 val; + u32 cpu = raw_smp_processor_id(); + + if (sk->sk_state != TCP_ESTABLISHED) + return; + + if (check_appname(current->comm)) + return; + + rcu_read_lock(); + tb = rcu_dereference(oecls_sock_flow_table); + hash = READ_ONCE(sk->sk_rxhash); + if (tb && hash) { + index = hash & tb->mask; + val = hash & ~oecls_cpu_mask; + val |= cpu; + + if (READ_ONCE(tb->ents[index]) != val) { + WRITE_ONCE(tb->ents[index], val); + + oecls_debug("[%s] sk:%p, hash:0x%x, index:0x%x, val:0x%x, cpu:%d\n", + current->comm, sk, hash, index, val, cpu); + } + } + rcu_read_unlock(); +} + +static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb) +{ + struct oecls_netdev_info *netdev_info; + int netdev_loop; + u32 hash, index; + struct oecls_numa_info *numa_info; + struct oecls_numa_bound_dev_info *bound_dev = NULL; + int rxq_id, rxq_num, i; + + numa_info = get_oecls_numa_info(nid); + if (!numa_info) + return -1; + + for_each_oecls_netdev(netdev_loop, netdev_info) { + if (strcmp(netdev_info->dev_name, dev->name) == 0) { + bound_dev = &numa_info->bound_dev[netdev_loop]; + break; + } + } + + if (!bound_dev) + return -1; + rxq_num = bitmap_weight(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); + if (rxq_num == 0) + return -1; + + hash = skb_get_hash(skb); + index = hash % rxq_num; + + i = 0; + for_each_set_bit(rxq_id, bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV) + if (index == i++) + return rxq_id; + + return -1; +} + +static void set_oecls_cpu(struct net_device *dev, struct sk_buff *skb, + struct oecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu) +{ + struct netdev_rx_queue *rxqueue; + struct oecls_dev_flow_table *dtb; + struct oecls_dev_flow *rflow; + u32 flow_id, hash; + u16 rxq_index; + int rc; + + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || + !(dev->features & NETIF_F_NTUPLE)) + return; + + rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb); + if (rxq_index == skb_get_rx_queue(skb) || rxq_index < 0) + return; + + rxqueue = dev->_rx + rxq_index; + dtb = rcu_dereference(rxqueue->oecls_ftb); + if (!dtb) + return; + + hash = skb_get_hash(skb); + flow_id = hash & dtb->mask; + rflow = &dtb->flows[flow_id]; + if (rflow->isvalid && rflow->cpu == next_cpu) { + rflow->timeout = jiffies; + return; + } + + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); + oecls_debug("skb:%p, rxq:%d, hash:0x%x, flow_id:%u, old_rxq_id:%d, next_cpu:%d, rc:%d\n", + skb, rxq_index, hash, flow_id, old_rxq_id, next_cpu, rc); + if (rc < 0) + return; + + rflow->filter = rc; + rflow->isvalid = 1; + rflow->timeout = jiffies; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = OECLS_NO_FILTER; + rflow->cpu = next_cpu; +} + +static void __oecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, + struct oecls_sock_flow_table *tb, struct oecls_dev_flow_table *dtb, + int old_rxq_id) +{ + struct oecls_dev_flow *rflow; + u32 last_recv_cpu, hash, val; + u32 tcpu = 0; + u32 cpu = raw_smp_processor_id(); + + skb_reset_network_header(skb); + hash = skb_get_hash(skb); + if (!hash) + return; + + val = READ_ONCE(tb->ents[hash & tb->mask]); + last_recv_cpu = val & oecls_cpu_mask; + rflow = &dtb->flows[hash & dtb->mask]; + tcpu = rflow->cpu; + + if ((val ^ 
hash) & ~oecls_cpu_mask) + return; + + if (cpu_to_node(cpu) == cpu_to_node(last_recv_cpu)) + return; + + if (tcpu >= nr_cpu_ids) + set_oecls_cpu(ndev, skb, rflow, old_rxq_id, last_recv_cpu); +} + +static void oecls_set_cpu(void *data, struct sk_buff *skb) +{ + struct net_device *ndev = skb->dev; + struct oecls_sock_flow_table *stb; + struct oecls_dev_flow_table *dtb; + struct netdev_rx_queue *rxqueue; + int rxq_id = -1; + + if (!ndev) + return; + + if (!is_oecls_config_netdev(ndev->name)) + return; + + rxqueue = ndev->_rx; + if (skb_rx_queue_recorded(skb)) { + rxq_id = skb_get_rx_queue(skb); + if (rxq_id >= ndev->real_num_rx_queues) { + oecls_debug("ndev:%s, rxq:%d, real_num:%d\n", ndev->name, + rxq_id, ndev->real_num_rx_queues); + return; + } + rxqueue += rxq_id; + } + + // oecls_debug("skb:%px, dev:%s, rxq_id:%d\n", skb, ndev->name, rxq_id); + if (rxq_id < 0) + return; + + rcu_read_lock(); + stb = rcu_dereference(oecls_sock_flow_table); + dtb = rcu_dereference(rxqueue->oecls_ftb); + if (stb && dtb) + __oecls_set_cpu(skb, ndev, stb, dtb, rxq_id); + + rcu_read_unlock(); +} + +static void oecls_dev_flow_table_free(struct rcu_head *rcu) +{ + struct oecls_dev_flow_table *table = container_of(rcu, + struct oecls_dev_flow_table, rcu); + vfree(table); +} + +static void oecls_dev_flow_table_cleanup(struct net_device *netdev, int qid) +{ + struct oecls_dev_flow_table *dtb; + struct netdev_rx_queue *queue; + int i; + + spin_lock(&oecls_dev_flow_lock); + for (i = 0; i < qid; i++) { + queue = netdev->_rx + i; + dtb = rcu_dereference_protected(queue->oecls_ftb, + lockdep_is_held(&oecls_dev_flow_lock)); + rcu_assign_pointer(queue->oecls_ftb, NULL); + } + spin_unlock(&oecls_dev_flow_lock); + call_rcu(&dtb->rcu, oecls_dev_flow_table_free); +} + +static int oecls_dev_flow_table_release(void) +{ + struct oecls_netdev_info *netdev_info; + int netdev_loop; + struct net_device *netdev; + + for_each_oecls_netdev(netdev_loop, netdev_info) { + netdev = netdev_info->netdev; + if (!netdev) + continue; + oecls_dev_flow_table_cleanup(netdev, netdev->num_rx_queues); + } + + return 0; +} + +static int _oecls_dev_flow_table_init(struct net_device *netdev) +{ + struct oecls_dev_flow_table *table; + int size = OECLS_DEV_FLOW_TABLE_NUM; + struct netdev_rx_queue *queue; + int i, j, ret = 0; + + size = roundup_pow_of_two(size); + oecls_debug("dev:%s, num_rx_queues:%d, mask:0x%x\n", netdev->name, netdev->num_rx_queues, + size - 1); + + for (i = 0; i < netdev->num_rx_queues; i++) { + table = vmalloc(OECLS_DEV_FLOW_TABLE_SIZE(size)); + if (!table) { + ret = -ENOMEM; + goto fail; + } + + table->mask = size - 1; + for (j = 0; j < size; j++) { + table->flows[j].cpu = OECLS_NO_CPU; + table->flows[j].isvalid = 0; + } + + queue = netdev->_rx + i; + + spin_lock(&oecls_dev_flow_lock); + rcu_assign_pointer(queue->oecls_ftb, table); + spin_unlock(&oecls_dev_flow_lock); + } + return ret; +fail: + oecls_dev_flow_table_cleanup(netdev, i); + return ret; +} + +static int oecls_dev_flow_table_init(void) +{ + struct oecls_netdev_info *netdev_info; + int netdev_loop; + struct net_device *ndev; + int i, err; + + for_each_oecls_netdev(netdev_loop, netdev_info) { + ndev = netdev_info->netdev; + if (!ndev) + continue; + err = _oecls_dev_flow_table_init(ndev); + if (err) + goto out; + } + + return 0; +out: + for (i = 0; i < netdev_loop; i++) { + netdev_info = get_oecls_netdev_info(i); + ndev = netdev_info->netdev; + if (!ndev) + continue; + oecls_dev_flow_table_cleanup(ndev, ndev->num_rx_queues); + } + return err; +} + +static int 
oecls_sock_flow_table_release(void) +{ + struct oecls_sock_flow_table *tb; + + mutex_lock(&oecls_sock_flow_mutex); + tb = rcu_dereference_protected(oecls_sock_flow_table, + lockdep_is_held(&oecls_sock_flow_mutex)); + if (tb) + rcu_assign_pointer(oecls_sock_flow_table, NULL); + mutex_unlock(&oecls_sock_flow_mutex); + synchronize_rcu(); + vfree(tb); + + unregister_trace_oecls_flow_update(&oecls_flow_update, NULL); + unregister_trace_oecls_set_cpu(&oecls_set_cpu, NULL); + unregister_trace_oecls_timeout(&oecls_timeout, NULL); + return 0; +} + +static int oecls_sock_flow_table_init(void) +{ + struct oecls_sock_flow_table *table; + int size = OECLS_SOCK_FLOW_TABLE_NUM; + int i; + + size = roundup_pow_of_two(size); + table = vmalloc(OECLS_SOCK_FLOW_TABLE_SIZE(size)); + if (!table) + return -ENOMEM; + + oecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + oecls_debug("nr_cpu_ids:%d, oecls_cpu_mask:0x%x\n", nr_cpu_ids, oecls_cpu_mask); + + table->mask = size - 1; + for (i = 0; i < size; i++) + table->ents[i] = OECLS_NO_CPU; + + mutex_lock(&oecls_sock_flow_mutex); + rcu_assign_pointer(oecls_sock_flow_table, table); + mutex_unlock(&oecls_sock_flow_mutex); + + register_trace_oecls_flow_update(oecls_flow_update, NULL); + register_trace_oecls_set_cpu(&oecls_set_cpu, NULL); + register_trace_oecls_timeout(&oecls_timeout, NULL); + return 0; +} + +void oecls_flow_res_init(void) +{ + oecls_sock_flow_table_init(); + oecls_dev_flow_table_init(); +} + +void oecls_flow_res_clean(void) +{ + oecls_sock_flow_table_release(); + oecls_dev_flow_table_release(); +} diff --git a/net/oenetcls/oenetcls_l0.c b/net/oenetcls/oenetcls_l0.c new file mode 100644 index 000000000000..b820a1cf96bd --- /dev/null +++ b/net/oenetcls/oenetcls_l0.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/kallsyms.h> +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/mempolicy.h> +#include <linux/pfn_t.h> +#include <linux/delay.h> +#include "oenetcls.h" + +static struct vm_area_struct *oecls_l0_cache __ro_after_init; +static void *oecls_l0_cache_head; +static atomic_t alloc_num_l0 = ATOMIC_INIT(0); +static int cur_offset_l0; +static u8 slot_l0[L0_MAX_PAGE_NUM] = { 0 }; + +void *alloc_from_l0(int size) +{ + int i; + void *addr = NULL; + + if (!oecls_l0_cache_head) + return kzalloc(size, GFP_ATOMIC); + + if (size > L0_MAX_PAGE_SIZE) { + oecls_error("size(%d) >= max_size(%d)!\n", size, L0_MAX_PAGE_SIZE); + return NULL; + } + + if (atomic_read(&alloc_num_l0) >= L0_MAX_PAGE_NUM) { + oecls_error("alloc_num_l0:%d exceed max num!\n", atomic_read(&alloc_num_l0)); + return NULL; + } + + for (i = 0; i < L0_MAX_PAGE_NUM; i++) { + cur_offset_l0 = (cur_offset_l0 + 1) % L0_MAX_PAGE_NUM; + if (slot_l0[cur_offset_l0] == 0) { + addr = oecls_l0_cache_head + cur_offset_l0 * L0_MAX_PAGE_SIZE; + atomic_inc(&alloc_num_l0); + slot_l0[cur_offset_l0] = 1; + break; + } + } + + if (!addr) + oecls_error("app:%s, pid:%d alloc fail!\n", current->comm, current->pid); + return addr; +} + +void free_to_l0(void *addr) +{ + int offset = 0; + + if (!addr) + return; + + if (!oecls_l0_cache_head) { + kfree(addr); + return; + } + + if (atomic_read(&alloc_num_l0) <= 0) { + oecls_error("alloc_num_l0:%d <= 0!\n", atomic_read(&alloc_num_l0)); + return; + } + + offset = (addr - oecls_l0_cache_head) / L0_MAX_PAGE_SIZE; + if (offset >= L0_MAX_PAGE_NUM) { + oecls_error("app:%s, pid:%d, offset:%d\n", current->comm, 
current->pid, offset); + return; + } + + slot_l0[offset] = 0; + atomic_dec(&alloc_num_l0); +} + +static int get_node_node(struct vm_area_struct *vma) +{ + struct mempolicy *pol; + nodemask_t *nmask; + int nid; + + nid = get_vma_policy_node(vma, vma->vm_start, GFP_KERNEL, &pol, &nmask); + if (pol->mode == MPOL_BIND || pol->mode == MPOL_PREFERRED_MANY) + nid = first_node(*nmask); + + return nid; +} + +static int oecls_l0_cache_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long cont_size = PAGE_ALIGN(vma->vm_end - vma->vm_start); + struct page *page = NULL; + struct l0_vma_data *data; + int page_cnt, nid; + int ret = 0; + + if ((vma->vm_start % PMD_SIZE) || (vma->vm_end % PMD_SIZE)) { + oecls_error("error vma_start: %#lx, vma_end: %#lx\n", vma->vm_start, vma->vm_end); + return -EINVAL; + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) { + oecls_error("kzalloc l0_vma_data fail!\n"); + return -ENOMEM; + } + + page_cnt = cont_size >> PAGE_SHIFT; + nid = get_node_node(vma); +#ifdef CONFIG_CONTIG_ALLOC + page = alloc_contig_pages(page_cnt, GFP_KERNEL, nid, NULL); +#endif + if (!page) { + ret = -ENOMEM; + oecls_error("alloc_contig_pages fail! page_cnt:%d, nid:%d\n", page_cnt, nid); + goto free_data; + } + +#if IS_ENABLED(CONFIG_HISI_L3T) + ret = l3t_shared_lock(nid, page_to_pfn(page), cont_size); +#endif + if (ret) { + ret = -EOPNOTSUPP; + oecls_error("l3t_shared_lock fail! ret: %d\n", ret); + goto free_page; + } + + data->page = page; + data->size = cont_size; + data->nid = nid; + vma->vm_private_data = data; + return 0; +free_page: + free_contig_range(page_to_pfn(page), page_cnt); +free_data: + kfree(data); + return ret; +} + +void init_oecls_l0_cache(void) +{ + int ret = 0; + struct l0_vma_data *sbkpages; + + oecls_l0_cache = kzalloc(sizeof(*oecls_l0_cache), GFP_KERNEL); + if (!oecls_l0_cache) { + oecls_error("kzalloc oecls_l0_cache fail!\n"); + return; + } + oecls_l0_cache->vm_start = 0; + oecls_l0_cache->vm_end = L0_MAX_PAGE_NUM * L0_MAX_PAGE_SIZE; + + ret = oecls_l0_cache_mmap(NULL, oecls_l0_cache); + if (ret != 0) { + kfree(oecls_l0_cache); + oecls_l0_cache = NULL; + return; + } + sbkpages = (struct l0_vma_data *)(oecls_l0_cache->vm_private_data); + oecls_l0_cache_head = page_to_virt(sbkpages->page); + + atomic_set(&alloc_num_l0, 0); + oecls_debug("l0_cache:%p, l0_cache_head:%p\n", oecls_l0_cache, oecls_l0_cache_head); +} + +void clean_oecls_l0_cache(void) +{ + struct l0_vma_data *sbkpages; + + if (!oecls_l0_cache) + return; + + oecls_debug("alloc_num_l0:%d\n", atomic_read(&alloc_num_l0)); + while (atomic_read(&alloc_num_l0) != 0) + mdelay(1); + + sbkpages = (struct l0_vma_data *)(oecls_l0_cache->vm_private_data); + if (sbkpages) { +#if IS_ENABLED(CONFIG_HISI_L3T) + l3t_shared_unlock(sbkpages->nid, page_to_pfn(sbkpages->page), sbkpages->size); +#endif + free_contig_range(page_to_pfn(sbkpages->page), sbkpages->size >> PAGE_SHIFT); + kfree(sbkpages); + oecls_l0_cache_head = NULL; + } + + kfree(oecls_l0_cache); +} diff --git a/net/oenetcls/oenetcls_main.c b/net/oenetcls/oenetcls_main.c new file mode 100644 index 000000000000..8cf5d02bb5b9 --- /dev/null +++ b/net/oenetcls/oenetcls_main.c @@ -0,0 +1,972 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/netdev_features.h> +#include <linux/ethtool.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/rtnetlink.h> +#include "oenetcls.h" + +int oecls_netdev_num; +static struct oecls_netdev_info 
oecls_netdev_info_table[OECLS_MAX_NETDEV_NUM]; + +int oecls_numa_num; +static struct oecls_numa_info oecls_numa_info_table[OECLS_MAX_NUMA_NUM]; + +int debug; +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "debug switch"); + +static int mode; +module_param(mode, int, 0444); +MODULE_PARM_DESC(mode, "mode, default 0"); + +static char ifname[64] = { 0 }; +module_param_string(ifname, ifname, sizeof(ifname), 0444); +MODULE_PARM_DESC(ifname, "ifname"); + +static char appname[64] = "redis-server"; +module_param_string(appname, appname, sizeof(appname), 0644); +MODULE_PARM_DESC(appname, "appname, default redis-server"); + +int match_ip_flag = 1; +module_param(match_ip_flag, int, 0644); +MODULE_PARM_DESC(match_ip_flag, "match ip flag"); + +static int strategy; +module_param(strategy, int, 0444); +MODULE_PARM_DESC(strategy, "strategy, default 0"); + +static bool check_params(void) +{ + if (mode != 0 && mode != 1) + return false; + + if (strlen(ifname) == 0) + return false; + + return true; +} + +int check_appname(char *task_name) +{ + char *start = appname; + char *end; + + if (!strlen(appname)) + return 0; + + // support appname: app1#app2#appN + while (*start != '\0') { + end = strchr(start, '#'); + if (end == start) { + start++; + continue; + } + + if (!end) { + if (!strncmp(task_name, start, strlen(start))) + return 0; + break; + } + + if (!strncmp(task_name, start, end - start)) + return 0; + start = end + 1; + } + return -EOPNOTSUPP; +} + +static u32 __ethtool_get_flags(struct net_device *dev) +{ + u32 flags = 0; + + if (dev->features & NETIF_F_LRO) + flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) + flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) + flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) + flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) + flags |= ETH_FLAG_RXHASH; + + return flags; +} + +static int __ethtool_set_flags(struct net_device *dev, u32 data) +{ + netdev_features_t features = 0, changed; + + if (data & ~ETH_ALL_FLAGS) + return -EINVAL; + + if (data & ETH_FLAG_LRO) + features |= NETIF_F_LRO; + if (data & ETH_FLAG_RXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_RX; + if (data & ETH_FLAG_TXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_TX; + if (data & ETH_FLAG_NTUPLE) + features |= NETIF_F_NTUPLE; + if (data & ETH_FLAG_RXHASH) + features |= NETIF_F_RXHASH; + + /* allow changing only bits set in hw_features */ + changed = (features ^ dev->features) & ETH_ALL_FEATURES; + if (changed & ~dev->hw_features) + return (changed & dev->hw_features) ? 
-EINVAL : -EOPNOTSUPP; + + dev->wanted_features = + (dev->wanted_features & ~changed) | (features & changed); + + __netdev_update_features(dev); + + return 0; +} + +static void ethtool_rxnfc_copy_to_user(void *useraddr, + const struct ethtool_rxnfc *rxnfc, + size_t size, const u32 *rule_buf) +{ + memcpy_r(useraddr, rxnfc, size); + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + + if (rule_buf) + memcpy_r(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32)); +} + +static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, + u32 cmd, void *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + int rc; + + if (!dev->ethtool_ops->set_rxnfc) + return -EOPNOTSUPP; + + if (cmd == ETHTOOL_SRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + memcpy_r(&info, useraddr, info_size); + rc = dev->ethtool_ops->set_rxnfc(dev, &info); + if (rc) + return rc; + + if (cmd == ETHTOOL_SRXCLSRLINS) + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); + + return 0; +} + +static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, + u32 cmd, void *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + if (cmd == ETHTOOL_GRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + memcpy_r(&info, useraddr, info_size); + + /* If FLOW_RSS was requested then user-space must be using the + * new definition, as FLOW_RSS is newer. + */ + if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { + info_size = sizeof(info); + memcpy_r(&info, useraddr, info_size); + /* Since malicious users may modify the original data, + * we need to check whether FLOW_RSS is still requested. 
+ */ + if (!(info.flow_type & FLOW_RSS)) + return -EINVAL; + } + + if (info.cmd != cmd) + return -EINVAL; + + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) + rule_buf = kcalloc(info.rule_cnt, sizeof(u32), + GFP_KERNEL); + if (!rule_buf) + return -ENOMEM; + } + } + + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); +err_out: + kfree(rule_buf); + + return ret; +} + +static noinline_for_stack int ethtool_get_channels(struct net_device *dev, + void *useraddr) +{ + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + + if (!dev->ethtool_ops->get_channels) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_channels(dev, &channels); + + memcpy_r(useraddr, &channels, sizeof(channels)); + return 0; +} + +static int ethtool_get_value(struct net_device *dev, char *useraddr, + u32 cmd, u32 (*actor)(struct net_device *)) +{ + struct ethtool_value edata = { .cmd = cmd }; + + if (!actor) + return -EOPNOTSUPP; + + edata.data = actor(dev); + + memcpy_r(useraddr, &edata, sizeof(edata)); + return 0; +} + +static int ethtool_set_value(struct net_device *dev, char *useraddr, + int (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + memcpy_r(&edata, useraddr, sizeof(edata)); + + return actor(dev, edata.data); +} + +static int dev_ethtool_kern(struct net *net, struct ifreq *ifr) +{ + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + void *useraddr = ifr->ifr_data; + u32 ethcmd, sub_cmd; + int rc; + netdev_features_t old_features; + + if (!dev || !netif_device_present(dev)) + return -ENODEV; + + memcpy_r(ðcmd, useraddr, sizeof(ethcmd)); + + if (ethcmd == ETHTOOL_PERQUEUE) + memcpy_r(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)); + else + sub_cmd = ethcmd; + + /* Allow some commands to be done by anyone */ + switch (sub_cmd) { + case ETHTOOL_GFLAGS: + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + case ETHTOOL_GCHANNELS: + break; + default: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + } + + if (dev->ethtool_ops->begin) { + rc = dev->ethtool_ops->begin(dev); + if (rc < 0) + return rc; + } + old_features = dev->features; + + switch (ethcmd) { + case ETHTOOL_GFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + __ethtool_get_flags); + break; + case ETHTOOL_SFLAGS: + rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); + break; + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_SRXFH: + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_GCHANNELS: + rc = ethtool_get_channels(dev, useraddr); + break; + default: + rc = -EOPNOTSUPP; + } + + if (dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + + if (old_features != dev->features) + netdev_features_change(dev); + + return rc; +} + +int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd) +{ + struct ifreq ifr = {0}; + int ret; + + strncpy(ifr.ifr_name, ctx->netdev, sizeof(ctx->netdev)); + ifr.ifr_data = cmd; + + rtnl_lock(); + ret = dev_ethtool_kern(&init_net, &ifr); + rtnl_unlock(); + + return ret; +} + +struct oecls_netdev_info 
*get_oecls_netdev_info(unsigned int index) +{ + if (index >= OECLS_MAX_NETDEV_NUM) + return NULL; + return &oecls_netdev_info_table[index]; +} + +static struct oecls_netdev_info *alloc_oecls_netdev_info(void) +{ + if (oecls_netdev_num >= OECLS_MAX_NETDEV_NUM) + return NULL; + + return &oecls_netdev_info_table[oecls_netdev_num++]; +} + +static bool check_irq_name(const char *irq_name, struct oecls_netdev_info *oecls_dev) +{ + if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx")) + return false; + + if (strstr(irq_name, oecls_dev->dev_name)) + return true; + + if (oecls_dev->netdev->dev.parent && + strstr(irq_name, dev_name(oecls_dev->netdev->dev.parent))) + return true; + + return false; +} + +static void get_netdev_queue_info(struct oecls_netdev_info *oecls_dev) +{ + struct oecls_netdev_queue_info *rxq_info; + struct irq_desc *desc; + int irq; + + for_each_irq_desc(irq, desc) { + if (!desc->action) + continue; + if (!desc->action->name) + continue; + if (!check_irq_name(desc->action->name, oecls_dev)) + continue; + + oecls_debug("irq=%d, [%s], rxq_id=%d\n", irq, desc->action->name, + oecls_dev->rxq_num); + + if (oecls_dev->rxq_num >= OECLS_MAX_RXQ_NUM_PER_DEV) + break; + rxq_info = &oecls_dev->rxq[oecls_dev->rxq_num++]; + rxq_info->irq = irq; + } +} + +static int oecls_filter_enable(const char *dev_name, bool *old_state) +{ + struct ethtool_value eval = {0}; + struct cmd_context ctx = {0}; + int ret; + + strncpy(ctx.netdev, dev_name, IFNAMSIZ); + + eval.cmd = ETHTOOL_GFLAGS; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + oecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + if (eval.data & ETH_FLAG_NTUPLE) { + *old_state = true; + oecls_debug("%s ntuple is already on\n", dev_name); + return 0; + } + + // Set ntuple feature + eval.cmd = ETHTOOL_SFLAGS; + eval.data |= ETH_FLAG_NTUPLE; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + oecls_error("set %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + + // Get ntuple feature + eval.cmd = ETHTOOL_GFLAGS; + eval.data = 0; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + oecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + if (!(eval.data & ETH_FLAG_NTUPLE)) { + oecls_error("enable ntuple feature fail!\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +static void oecls_filter_restore(const char *dev_name, bool old_state) +{ + struct ethtool_value eval = {0}; + struct cmd_context ctx = {0}; + bool cur_filter_state; + int ret; + + strncpy(ctx.netdev, dev_name, IFNAMSIZ); + + eval.cmd = ETHTOOL_GFLAGS; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + oecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return; + } + + cur_filter_state = (eval.data & ETH_FLAG_NTUPLE) ? true : false; + if (cur_filter_state == old_state) + return; + + // Set ntuple feature + eval.cmd = ETHTOOL_SFLAGS; + if (old_state) + eval.data |= ETH_FLAG_NTUPLE; + else + eval.data &= ~ETH_FLAG_NTUPLE; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + oecls_error("set %s flags fail, ret:%d\n", dev_name, ret); + return; + } +} + +static int init_single_oecls_dev(char *if_name, unsigned int length) +{ + struct oecls_netdev_info *oecls_dev; + char dev_name[IFNAMSIZ] = { 0 }; + struct net_device *netdev; + int cpy_len = length < IFNAMSIZ ? 
length : IFNAMSIZ; + bool old_state = false; + int ret; + + strncpy(dev_name, if_name, cpy_len); + netdev = dev_get_by_name(&init_net, dev_name); + if (!netdev) { + oecls_error("dev [%s] is not exist!\n", dev_name); + return -ENODEV; + } + + if (!(netdev->flags & IFF_UP)) { + ret = -ENETDOWN; + oecls_error("dev:%s not up! flags=%d.\n", dev_name, netdev->flags); + goto out; + } + + if (netdev->flags & IFF_LOOPBACK) { + ret = -EOPNOTSUPP; + oecls_error("Do not support loopback.\n"); + goto out; + } + + ret = oecls_filter_enable(dev_name, &old_state); + if (ret) { + oecls_error("dev [%s] not support ntuple! ret=%d\n", dev_name, ret); + goto out; + } + + oecls_dev = alloc_oecls_netdev_info(); + if (!oecls_dev) { + ret = -ENOMEM; + oecls_filter_restore(dev_name, old_state); + oecls_error("alloc oecls_dev fail! oecls_netdev_num:%d\n", oecls_netdev_num); + goto out; + } + + memcpy_r(oecls_dev->dev_name, dev_name, IFNAMSIZ); + oecls_dev->old_filter_state = old_state; + oecls_dev->netdev = netdev; + get_netdev_queue_info(oecls_dev); + return 0; + +out: + dev_put(netdev); + return ret; +} + +static void clean_oecls_netdev_info(void) +{ + struct oecls_netdev_info *oecls_dev; + struct net_device *netdev; + int devid; + + for_each_oecls_netdev(devid, oecls_dev) { + oecls_filter_restore(oecls_dev->dev_name, oecls_dev->old_filter_state); + netdev = oecls_dev->netdev; + if (netdev) { + oecls_dev->netdev = NULL; + dev_put(netdev); + } + } + + oecls_netdev_num = 0; +} + +static int init_oecls_netdev_info(char *netdev_str) +{ + char *start = netdev_str; + char *end; + int err = -ENODEV; + + while (*start != '\0') { + // skip start # + end = strchr(start, '#'); + if (end == start) { + start++; + continue; + } + + // find the last ifname + if (!end) { + err = init_single_oecls_dev(start, strlen(start)); + break; + } + + err = init_single_oecls_dev(start, end - start); + if (err) + break; + start = end + 1; + } + + return err; +} + +struct oecls_numa_info *get_oecls_numa_info(unsigned int nid) +{ + if (nid >= OECLS_MAX_NUMA_NUM) + return NULL; + return &oecls_numa_info_table[nid]; +} + +static void clean_oecls_numa_info(void) +{ + oecls_numa_num = 0; +} + +static void init_numa_avail_cpus(int nid, struct oecls_numa_info *numa_info) +{ + int cpu; + + oecls_debug("numa node %d: %*pb, %*pbl\n", nid, cpumask_pr_args(cpumask_of_node(nid)), + cpumask_pr_args(cpumask_of_node(nid))); + + bitmap_zero(numa_info->avail_cpus, OECLS_MAX_CPU_NUM); + for_each_cpu(cpu, cpumask_of_node(nid)) { + if (cpu >= OECLS_MAX_CPU_NUM) + return; + set_bit(cpu, numa_info->avail_cpus); + } +} + +static void init_numa_rxq_bitmap(int nid, struct oecls_numa_info *numa_info) +{ + struct oecls_numa_bound_dev_info *bound_dev; + struct oecls_netdev_info *oecls_dev; + int bound_rxq_num; + int rxq_id; + int devid; + int cpu; + + for_each_oecls_netdev(devid, oecls_dev) { + bound_rxq_num = 0; + bound_dev = &numa_info->bound_dev[devid]; + bitmap_zero(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); + + for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) { + cpu = oecls_dev->rxq[rxq_id].affinity_cpu; + if (cpu_to_node(cpu) == nid) { + set_bit(rxq_id, bound_dev->bitmap_rxq); + bound_rxq_num++; + } + } + + oecls_debug("nid:%d, dev_id:%d, dev:%s, rxq_num:%d, bit_num:%d, bitmap_rxq:%*pbl\n", + nid, devid, oecls_dev->dev_name, oecls_dev->rxq_num, + bound_rxq_num, OECLS_MAX_RXQ_NUM_PER_DEV, bound_dev->bitmap_rxq); + } +} + +int alloc_rxq_id(int nid, int devid) +{ + struct oecls_numa_bound_dev_info *bound_dev; + struct oecls_numa_info *numa_info; + int 
rxq_id; + + numa_info = get_oecls_numa_info(nid); + if (!numa_info) { + oecls_error("error nid:%d\n", nid); + return -EINVAL; + } + + if (devid >= OECLS_MAX_NETDEV_NUM) { + oecls_error("error bound_dev index:%d\n", devid); + return -EINVAL; + } + bound_dev = &numa_info->bound_dev[devid]; + + rxq_id = find_first_bit(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); + if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) { + oecls_error("error rxq_id:%d\n", rxq_id); + return -EINVAL; + } + + clear_bit(rxq_id, bound_dev->bitmap_rxq); + oecls_debug("alloc nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); + return rxq_id; +} + +void free_rxq_id(int nid, int devid, int rxq_id) +{ + struct oecls_numa_bound_dev_info *bound_dev; + struct oecls_numa_info *numa_info; + + numa_info = get_oecls_numa_info(nid); + if (!numa_info) { + oecls_error("error nid:%d\n", nid); + return; + } + + if (devid >= OECLS_MAX_NETDEV_NUM) { + oecls_error("error bound_dev index:%d\n", devid); + return; + } + bound_dev = &numa_info->bound_dev[devid]; + + if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) { + oecls_error("error rxq_id:%d\n", rxq_id); + return; + } + + if (test_bit(rxq_id, bound_dev->bitmap_rxq)) { + oecls_error("error nid:%d, devid:%d, rxq_id:%d\n", nid, devid, rxq_id); + return; + } + + set_bit(rxq_id, bound_dev->bitmap_rxq); + oecls_debug("free nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); +} + +static void init_oecls_numa_info(void) +{ + struct oecls_numa_info *numa_info; + unsigned int numa_num; + int nid; + + numa_num = num_online_nodes(); + if (numa_num > OECLS_MAX_NUMA_NUM) { + oecls_error("online numa num:%d is too much!\n", numa_num); + numa_num = OECLS_MAX_NUMA_NUM; + } + oecls_numa_num = numa_num; + oecls_debug("set oecls_numa_num=%d\n", numa_num); + + for_each_oecls_numa(nid, numa_info) + init_numa_avail_cpus(nid, numa_info); +} + +static int alloc_available_cpu(int nid, struct oecls_numa_info *numa_info) +{ + int cpu; + + cpu = find_first_bit(numa_info->avail_cpus, OECLS_MAX_CPU_NUM); + if (cpu >= OECLS_MAX_CPU_NUM) { + oecls_error("no available cpus: nid=%d, cpu=%d\n", nid, cpu); + return -1; + } + + clear_bit(cpu, numa_info->avail_cpus); + return cpu; +} + +static void add_netdev_irq_affinity_cpu(struct oecls_netdev_info *oecls_dev, int rxq_id, int cpu) +{ + struct oecls_netdev_queue_info *rxq_info; + + if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) + return; + + rxq_info = &oecls_dev->rxq[rxq_id]; + rxq_info->affinity_cpu = cpu; +} + +static void config_affinity_strategy_default(struct oecls_netdev_info *oecls_dev) +{ + struct oecls_numa_info *numa_info; + int rxq_num = oecls_dev->rxq_num; + int rxq_per_numa = rxq_num / oecls_numa_num; + int remain = rxq_num - rxq_per_numa * oecls_numa_num; + int numa_rxq_id; + int rxq_id; + int nid; + int cpu; + + oecls_debug("dev=%s, rxq_num=%d, rxq_per_numa=%d, remain=%d\n", oecls_dev->dev_name, + rxq_num, rxq_per_numa, remain); + + // average config rxq to every numa + for_each_oecls_numa(nid, numa_info) { + for (numa_rxq_id = 0; numa_rxq_id < rxq_per_numa; numa_rxq_id++) { + cpu = alloc_available_cpu(nid, numa_info); + if (cpu < 0) + break; + + rxq_id = rxq_per_numa * nid + numa_rxq_id; + add_netdev_irq_affinity_cpu(oecls_dev, rxq_id, cpu); + } + } + + if (!remain) + return; + + // config remain rxq to every numa + numa_rxq_id = 0; + for_each_oecls_numa(nid, numa_info) { + if (numa_rxq_id >= remain) + break; + cpu = alloc_available_cpu(nid, numa_info); + if (cpu < 0) + break; + + rxq_id = rxq_per_numa * oecls_numa_num + numa_rxq_id; + numa_rxq_id++; + 
add_netdev_irq_affinity_cpu(oecls_dev, rxq_id, cpu); + } +} + +static void config_affinity_strategy_cluster(struct oecls_netdev_info *oecls_dev) +{ + int cluster_cpu_num = 8; + int cluster_num = num_online_cpus() / cluster_cpu_num; + int cluster_cpu_id = 0; + int rxq_id = 0; + int cluster; + int cpu; + + // average config rxq to every cluster + while (rxq_id < oecls_dev->rxq_num) { + for (cluster = 0; cluster < cluster_num; cluster++) { + cpu = cluster * cluster_cpu_num + cluster_cpu_id; + if (rxq_id >= oecls_dev->rxq_num) + break; + add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu); + } + cluster_cpu_id++; + } +} + +static void config_affinity_strategy_16cores(struct oecls_netdev_info *oecls_dev) +{ + struct oecls_numa_info *numa_info; + int numa_start_cpu; + int numa_cpu_id; + int rxq_id = 0; + int nid; + int cpu; + + // only use 16 cores of one numa + for_each_oecls_numa(nid, numa_info) { + numa_start_cpu = find_first_bit(numa_info->avail_cpus, OECLS_MAX_CPU_NUM); + for (numa_cpu_id = 0; numa_cpu_id < 16; numa_cpu_id++) { + cpu = numa_start_cpu + numa_cpu_id; + + if (rxq_id >= oecls_dev->rxq_num) + break; + add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu); + } + } +} + +static void config_affinity_strategy(void) +{ + struct oecls_netdev_info *oecls_dev; + int devid; + + for_each_oecls_netdev(devid, oecls_dev) { + switch (strategy) { + case 0: + config_affinity_strategy_default(oecls_dev); + break; + case 1: + config_affinity_strategy_cluster(oecls_dev); + break; + case 2: + config_affinity_strategy_16cores(oecls_dev); + break; + default: + config_affinity_strategy_default(oecls_dev); + break; + } + } +} + +static inline void irq_set_affinity_wrapper(int rxq, int irq, int cpu) +{ + int err = 0; + + err = irq_set_affinity(irq, get_cpu_mask(cpu)); + oecls_debug("rxq=%d, irq=%d, cpu=%d, err=%d\n", rxq, irq, cpu, err); +} + +static void enable_affinity_strategy(void) +{ + struct oecls_netdev_queue_info *rxq_info; + struct oecls_netdev_info *oecls_dev; + int rxq_id; + int devid; + + for_each_oecls_netdev(devid, oecls_dev) { + for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) { + rxq_info = &oecls_dev->rxq[rxq_id]; + irq_set_affinity_wrapper(rxq_id, rxq_info->irq, rxq_info->affinity_cpu); + } + } +} + +static inline void netif_set_xps_queue_wrapper(struct net_device *netdev, int rxq_id, + const struct cpumask *cpu_mask) +{ + int err = 0; + + err = netif_set_xps_queue(netdev, cpu_mask, rxq_id); + oecls_debug("name=%s, rxq_id=%d, mask=%*pbl, err=%d\n", netdev->name, rxq_id, + cpumask_pr_args(cpu_mask), err); +} + +static void set_netdev_xps_queue(bool enable) +{ + const struct cpumask clear_mask = { 0 }; + struct oecls_netdev_info *oecls_dev; + const struct cpumask *cpu_mask; + int rxq_id; + int devid; + int cpu; + int nid; + + for_each_oecls_netdev(devid, oecls_dev) { + for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) { + cpu = oecls_dev->rxq[rxq_id].affinity_cpu; + nid = cpu_to_node(cpu); + if (enable) + cpu_mask = cpumask_of_node(nid); + else + cpu_mask = &clear_mask; + + netif_set_xps_queue_wrapper(oecls_dev->netdev, rxq_id, cpu_mask); + } + } +} + +static __init int oecls_init(void) +{ + struct oecls_numa_info *numa_info; + int nid; + int err; + + oecls_debug("[init] mode=%d, ifname=[%s]\n", mode, ifname); + if (!check_params()) + return -EINVAL; + + init_oecls_l0_cache(); + init_oecls_numa_info(); + err = init_oecls_netdev_info(ifname); + if (err) + goto out; + + // Set irq affinity + config_affinity_strategy(); + enable_affinity_strategy(); + + // Calculate rxq 
bounded to one numa + for_each_oecls_numa(nid, numa_info) + init_numa_rxq_bitmap(nid, numa_info); + +#ifdef CONFIG_XPS + set_netdev_xps_queue(true); +#endif + + if (mode == 0) + oecls_ntuple_res_init(); + else + oecls_flow_res_init(); + + return 0; +out: + clean_oecls_netdev_info(); + clean_oecls_numa_info(); + clean_oecls_l0_cache(); + return err; +} + +static __exit void oecls_exit(void) +{ + oecls_debug("[exit] mode=%d\n", mode); + if (mode == 0) + oecls_ntuple_res_clean(); + else + oecls_flow_res_clean(); + +#ifdef CONFIG_XPS + set_netdev_xps_queue(false); +#endif + + clean_oecls_netdev_info(); + clean_oecls_numa_info(); + clean_oecls_l0_cache(); +} + +module_init(oecls_init); +module_exit(oecls_exit); + +MODULE_DESCRIPTION("oenetcls"); +MODULE_LICENSE("GPL v2"); diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c new file mode 100644 index 000000000000..38d1f5df6ff1 --- /dev/null +++ b/net/oenetcls/oenetcls_ntuple.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/inetdevice.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/inet.h> +#include <linux/jhash.h> +#include <net/sock.h> +#include <trace/hooks/oenetcls.h> +#include "oenetcls.h" + +struct oecls_sk_rule_list oecls_sk_rules; + +static void init_oecls_sk_rules(void) +{ + unsigned int i; + + for (i = 0; i < OECLS_SK_RULE_HASHSIZE; i++) + INIT_HLIST_HEAD(oecls_sk_rules.hash + i); + mutex_init(&oecls_sk_rules.mutex); +} + +static struct hlist_head *oecls_sk_rule_hash(u32 dip4, u16 dport) +{ + return oecls_sk_rules.hash + (jhash_2words(dip4, dport, 0) & OECLS_SK_RULE_HASHMASK); +} + +static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action, + int ruleid, int nid) +{ + struct hlist_head *hlist = oecls_sk_rule_hash(dip4, dport); + struct oecls_sk_rule *rule; + + rule = alloc_from_l0(sizeof(struct oecls_sk_rule)); + if (!rule) + return; + oecls_debug("alloc rule=%p\n", rule); + + rule->sk = sk; + rule->dip4 = dip4; + rule->dport = dport; + rule->devid = devid; + rule->action = action; + rule->ruleid = ruleid; + rule->nid = nid; + hlist_add_head(&rule->node, hlist); +} + +static void del_sk_rule(struct oecls_sk_rule *rule) +{ + hlist_del_init(&rule->node); + oecls_debug("del rule=%p\n", rule); + free_to_l0(rule); +} + +static struct oecls_sk_rule *get_sk_rule(int devid, u32 dip4, u16 dport) +{ + struct hlist_head *hlist = oecls_sk_rule_hash(dip4, dport); + struct oecls_sk_rule *rule = NULL; + + hlist_for_each_entry(rule, hlist, node) { + if (rule->devid == devid && rule->dip4 == dip4 && rule->dport == dport) + break; + } + return rule; +} + +static bool reuseport_check(int devid, u32 dip4, u16 dport) +{ + return !!get_sk_rule(devid, dip4, dport); +} + +static u32 get_first_ip4_addr(struct net *net) +{ + struct in_device *in_dev; + struct net_device *dev; + struct in_ifaddr *ifa; + u32 dip4 = 0; + + rtnl_lock(); + rcu_read_lock(); + for_each_netdev(net, dev) { + if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP)) + continue; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (!strcmp(dev->name, ifa->ifa_label)) { + dip4 = ifa->ifa_local; + oecls_debug("dev: %s, dip4: 0x%x\n", dev->name, dip4); + goto out; + } + } + } +out: + rcu_read_unlock(); + rtnl_unlock(); + return dip4; +} + +static void get_sk_rule_addr(struct sock *sk, u32 *dip4, u16 *dport) +{ + *dport = htons(sk->sk_num); + + if (!match_ip_flag) { + *dip4 = 0; + 
return; + } + + if (sk->sk_rcv_saddr) + *dip4 = sk->sk_rcv_saddr; + else + *dip4 = get_first_ip4_addr(sock_net(sk)); +} + +static int rxclass_rule_del(struct cmd_context *ctx, __u32 loc) +{ + struct ethtool_rxnfc nfccmd; + int err; + + nfccmd.cmd = ETHTOOL_SRXCLSRLDEL; + nfccmd.fs.location = loc; + err = send_ethtool_ioctl(ctx, &nfccmd); + if (err < 0) + oecls_debug("rmgr: Cannot delete RX class rule, loc:%u\n", loc); + return err; +} + +static int rmgr_ins(struct rmgr_ctrl *rmgr, __u32 loc) +{ + if (loc >= rmgr->size) { + oecls_error("rmgr: Location out of range\n"); + return -1; + } + + set_bit(loc, rmgr->slot); + return 0; +} + +static int rmgr_find_empty_slot(struct rmgr_ctrl *rmgr, struct ethtool_rx_flow_spec *fsp) +{ + __u32 loc, slot_num; + + if (rmgr->driver_select) + return 0; + + loc = rmgr->size - 1; + slot_num = loc / BITS_PER_LONG; + if (!~(rmgr->slot[slot_num] | (~1UL << rmgr->size % BITS_PER_LONG))) { + loc -= 1 + (loc % BITS_PER_LONG); + slot_num--; + } + + while (loc < rmgr->size && !~(rmgr->slot[slot_num])) { + loc -= BITS_PER_LONG; + slot_num--; + } + + while (loc < rmgr->size && test_bit(loc, rmgr->slot)) + loc--; + + if (loc < rmgr->size) { + fsp->location = loc; + return rmgr_ins(rmgr, loc); + } + + return -1; +} + +static int rxclass_get_dev_info(struct cmd_context *ctx, __u32 *count, int *driver_select) +{ + struct ethtool_rxnfc nfccmd; + int err; + + nfccmd.cmd = ETHTOOL_GRXCLSRLCNT; + nfccmd.data = 0; + err = send_ethtool_ioctl(ctx, &nfccmd); + *count = nfccmd.rule_cnt; + if (driver_select) + *driver_select = !!(nfccmd.data & RX_CLS_LOC_SPECIAL); + if (err < 0) + oecls_debug("rxclass: Cannot get RX class rule count\n"); + + return err; +} + +static int rmgr_init(struct cmd_context *ctx, struct rmgr_ctrl *rmgr) +{ + struct ethtool_rxnfc *nfccmd; + __u32 *rule_locs; + int i, err = 0; + + memset(rmgr, 0, sizeof(*rmgr)); + err = rxclass_get_dev_info(ctx, &rmgr->n_rules, &rmgr->driver_select); + if (err < 0) + return err; + + if (rmgr->driver_select) + return err; + + nfccmd = kzalloc(sizeof(*nfccmd) + (rmgr->n_rules * sizeof(__u32)), GFP_ATOMIC); + if (!nfccmd) { + oecls_error("rmgr: Cannot allocate memory for RX class rule locations\n"); + err = -ENOMEM; + goto out; + } + + nfccmd->cmd = ETHTOOL_GRXCLSRLALL; + nfccmd->rule_cnt = rmgr->n_rules; + err = send_ethtool_ioctl(ctx, nfccmd); + if (err < 0) { + oecls_debug("rmgr: Cannot get RX class rules\n"); + goto out; + } + + rmgr->size = nfccmd->data; + if (rmgr->size == 0 || rmgr->size < rmgr->n_rules) { + oecls_error("rmgr: Invalid RX class rules table size\n"); + err = -EINVAL; + goto out; + } + + rmgr->slot = kzalloc(BITS_TO_LONGS(rmgr->size) * sizeof(long), GFP_ATOMIC); + if (!rmgr->slot) { + oecls_error("rmgr: Cannot allocate memory for RX class rules\n"); + err = -ENOMEM; + goto out; + } + + rule_locs = nfccmd->rule_locs; + for (i = 0; i < rmgr->n_rules; i++) { + err = rmgr_ins(rmgr, rule_locs[i]); + if (err < 0) + break; + } + +out: + kfree(nfccmd); + return err; +} + +static void rmgr_cleanup(struct rmgr_ctrl *rmgr) +{ + kfree(rmgr->slot); + rmgr->slot = NULL; + rmgr->size = 0; +} + +static int rmgr_set_location(struct cmd_context *ctx, + struct ethtool_rx_flow_spec *fsp) +{ + struct rmgr_ctrl rmgr; + int ret; + + ret = rmgr_init(ctx, &rmgr); + if (ret < 0) + goto out; + + ret = rmgr_find_empty_slot(&rmgr, fsp); +out: + rmgr_cleanup(&rmgr); + return ret; +} + +static int rxclass_rule_ins(struct cmd_context *ctx, + struct ethtool_rx_flow_spec *fsp, u32 rss_context) +{ + struct ethtool_rxnfc nfccmd; + u32 loc = 
fsp->location; + int ret; + + if (loc & RX_CLS_LOC_SPECIAL) { + ret = rmgr_set_location(ctx, fsp); + if (ret < 0) + return ret; + } + + nfccmd.cmd = ETHTOOL_SRXCLSRLINS; + nfccmd.rss_context = rss_context; + nfccmd.fs = *fsp; + ret = send_ethtool_ioctl(ctx, &nfccmd); + if (ret < 0) { + oecls_debug("Can not insert the clasification rule\n"); + return ret; + } + + if (loc & RX_CLS_LOC_SPECIAL) + oecls_debug("Added rule with ID %d\n", nfccmd.fs.location); + + return 0; +} + +static void flow_spec_to_ntuple(struct ethtool_rx_flow_spec *fsp, + struct ethtool_rx_ntuple_flow_spec *ntuple) +{ + int i; + + memset(ntuple, ~0, sizeof(*ntuple)); + ntuple->flow_type = fsp->flow_type; + ntuple->action = fsp->ring_cookie; + memcpy_r(&ntuple->h_u, &fsp->h_u, sizeof(fsp->h_u)); + memcpy_r(&ntuple->m_u, &fsp->m_u, sizeof(fsp->m_u)); + for (i = 0; i < sizeof(ntuple->m_u); i++) + ntuple->m_u.hdata[i] ^= 0xFF; + ntuple->flow_type &= ~FLOW_EXT; +} + +static int do_srxntuple(struct cmd_context *ctx, struct ethtool_rx_flow_spec *fsp) +{ + struct ethtool_rx_ntuple ntuplecmd; + struct ethtool_value eval; + int ret = 0; + + flow_spec_to_ntuple(fsp, &ntuplecmd.fs); + + eval.cmd = ETHTOOL_GFLAGS; + ret = send_ethtool_ioctl(ctx, &eval); + if (ret || !(eval.data & ETH_FLAG_NTUPLE)) + return -1; + + ntuplecmd.cmd = ETHTOOL_SRXNTUPLE; + ret = send_ethtool_ioctl(ctx, &ntuplecmd); + if (ret) + oecls_debug("Cannot add new rule via N-tuple, ret:%d\n", ret); + + return ret; +} + +static int cfg_ethtool_rule(struct cmd_context *ctx, bool is_del) +{ + struct ethtool_rx_flow_spec *fsp, rx_rule_fs; + u32 rss_context = 0; + int ret; + + oecls_debug("is_del:%d netdev:%s, dip4:%pI4, dport:%d, action:%d, ruleid:%u, del_ruleid:%u\n", + is_del, ctx->netdev, &ctx->dip4, ntohs(ctx->dport), ctx->action, ctx->ruleid, + ctx->del_ruleid); + + if (is_del) + return rxclass_rule_del(ctx, ctx->del_ruleid); + + ctx->ret_loc = -1; + + fsp = &rx_rule_fs; + memset(fsp, 0, sizeof(*fsp)); + fsp->flow_type = TCP_V4_FLOW; + fsp->location = RX_CLS_LOC_ANY; + fsp->h_u.tcp_ip4_spec.ip4dst = ctx->dip4; + fsp->h_u.tcp_ip4_spec.pdst = ctx->dport; + if (ctx->dip4) + fsp->m_u.tcp_ip4_spec.ip4dst = (u32)~0ULL; + fsp->m_u.tcp_ip4_spec.pdst = (u16)~0ULL; + if (ctx->ruleid) + fsp->location = ctx->ruleid; + fsp->ring_cookie = ctx->action; + + ret = do_srxntuple(ctx, &rx_rule_fs); + if (!ret) + return 0; + + ret = rxclass_rule_ins(ctx, &rx_rule_fs, rss_context); + if (!ret) + ctx->ret_loc = rx_rule_fs.location; + return ret; +} + +static void del_ntuple_rule(struct sock *sk) +{ + struct oecls_netdev_info *oecls_dev; + struct cmd_context ctx = { 0 }; + struct oecls_sk_rule *rule; + int devid; + u16 dport; + u32 dip4; + int err; + + get_sk_rule_addr(sk, &dip4, &dport); + + mutex_lock(&oecls_sk_rules.mutex); + for_each_oecls_netdev(devid, oecls_dev) { + strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); + rule = get_sk_rule(devid, dip4, dport); + if (!rule) { + oecls_debug("rule not found! 
sk:%p, devid:%d, dip4:0x%x, dport:%d\n", sk, + devid, dip4, dport); + continue; + } + + // Config Ntuple rule to dev + ctx.del_ruleid = rule->ruleid; + err = cfg_ethtool_rule(&ctx, true); + if (err) { + oecls_error("del sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n", + sk, rule->nid, devid, rule->action, rule->ruleid, err); + } + + // Free the bound queue + free_rxq_id(rule->nid, devid, rule->action); + + // Delete sk rule + del_sk_rule(rule); + } + mutex_unlock(&oecls_sk_rules.mutex); +} + +static void add_ntuple_rule(struct sock *sk) +{ + struct oecls_netdev_info *oecls_dev; + struct cmd_context ctx = { 0 }; + int cpu = smp_processor_id(); + int nid = cpu_to_node(cpu); + int rxq_id; + int devid; + int err; + + if (check_appname(current->comm)) + return; + get_sk_rule_addr(sk, &ctx.dip4, &ctx.dport); + + mutex_lock(&oecls_sk_rules.mutex); + for_each_oecls_netdev(devid, oecls_dev) { + strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); + if (reuseport_check(devid, ctx.dip4, ctx.dport)) { + oecls_error("dip4:0x%x, dport:%d reuse!\n", ctx.dip4, ctx.dport); + continue; + } + + // Calculate the bound queue + rxq_id = alloc_rxq_id(nid, devid); + if (rxq_id < 0) + continue; + + // Config Ntuple rule to dev + ctx.action = (u16)rxq_id; + err = cfg_ethtool_rule(&ctx, false); + if (err) { + oecls_error("add sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n", + sk, nid, devid, ctx.action, ctx.ret_loc, err); + continue; + } + + // Add sk rule + add_sk_rule(devid, ctx.dip4, ctx.dport, sk, ctx.action, ctx.ret_loc, nid); + } + mutex_unlock(&oecls_sk_rules.mutex); +} + +static void ethtool_cfg_rxcls(void *data, struct sock *sk, int is_del) +{ + if (sk->sk_state != TCP_LISTEN) + return; + + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return; + + oecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, ip:0x%x, port:0x%x\n", smp_processor_id(), + current->comm, sk, is_del, sk->sk_rcv_saddr, sk->sk_num); + + if (is_del) + del_ntuple_rule(sk); + else + add_ntuple_rule(sk); +} + +static void clean_oecls_sk_rules(void) +{ + struct oecls_netdev_info *oecls_dev; + struct cmd_context ctx = { 0 }; + struct oecls_sk_rule *rule; + struct hlist_head *hlist; + struct hlist_node *n; + unsigned int i; + int err; + + mutex_lock(&oecls_sk_rules.mutex); + for (i = 0; i < OECLS_SK_RULE_HASHSIZE; i++) { + hlist = &oecls_sk_rules.hash[i]; + + hlist_for_each_entry_safe(rule, n, hlist, node) { + oecls_dev = get_oecls_netdev_info(rule->devid); + if (!oecls_dev) + continue; + strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); + ctx.del_ruleid = rule->ruleid; + err = cfg_ethtool_rule(&ctx, true); + oecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, err:%d\n", rule->sk, + rule->devid, rule->action, rule->ruleid, err); + + hlist_del(&rule->node); + oecls_debug("clean rule=%p\n", rule); + free_to_l0(rule); + } + } + mutex_unlock(&oecls_sk_rules.mutex); +} + +void oecls_ntuple_res_init(void) +{ + init_oecls_sk_rules(); + register_trace_ethtool_cfg_rxcls(ðtool_cfg_rxcls, NULL); +} + +void oecls_ntuple_res_clean(void) +{ + unregister_trace_ethtool_cfg_rxcls(ðtool_cfg_rxcls, NULL); + clean_oecls_sk_rules(); +} -- 2.34.1
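
A note for anyone reproducing the rule setup from userspace: below is a minimal sketch (not part of the patch) of the ETHTOOL_SRXCLSRLINS ioctl that cfg_ethtool_rule() drives in-kernel through send_ethtool_ioctl(). The interface name "eth0", destination port 80 and RX queue 3 are placeholder values for illustration only; the struct ethtool_rxnfc / ethtool_rx_flow_spec layout is the standard uapi from linux/ethtool.h.

/*
 * Userspace sketch: insert a TCP/IPv4 ntuple rule that steers traffic to
 * destination port 80 onto RX queue 3, letting the driver pick the slot.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <arpa/inet.h>

int main(void)
{
	struct ethtool_rxnfc nfc = { 0 };
	struct ifreq ifr = { 0 };
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	nfc.cmd = ETHTOOL_SRXCLSRLINS;
	nfc.fs.flow_type = TCP_V4_FLOW;
	nfc.fs.location = RX_CLS_LOC_ANY;          /* let the driver choose a free location */
	nfc.fs.h_u.tcp_ip4_spec.pdst = htons(80);  /* match destination port 80 */
	nfc.fs.m_u.tcp_ip4_spec.pdst = 0xffff;     /* port is the only match key */
	nfc.fs.ring_cookie = 3;                    /* steer matching flows to RX queue 3 */

	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&nfc;

	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		perror("ETHTOOL_SRXCLSRLINS");
	else
		printf("rule installed at location %u\n", nfc.fs.location);

	close(fd);
	return 0;
}

The kernel-side path in the patch builds the same ethtool_rx_flow_spec (optionally matching ip4dst when match_ip_flag is set) and records the returned location as the rule id to delete later with ETHTOOL_SRXCLSRLDEL.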

Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/16553 Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/MHA...