[PATCH] Add oenetcls support for velinux 5.15

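This introduces oenetcls, which configures ntuple rules and binds NIC interrupts to netdev queues automatically to achieve socket NUMA affinity. Behaviour is selected entirely through module parameters; a minimal usage sketch (assuming CONFIG_OENETCLS is built as a module and eth0/eth1 stand in for real interface names):

    # mode=0 installs ntuple rules at listen()/close(), mode=1 uses the
    # flow-table steering path; '#' joins multiple interfaces or app names
    modprobe oenetcls ifname=eth0#eth1 mode=0 appname=redis-server strategy=0
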
Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- ...roduce-oenetcls-for-network-optimiza.patch | 2812 +++++++++++++++++ 1 file changed, 2812 insertions(+) create mode 100644 0001-net-oenetcls-introduce-oenetcls-for-network-optimiza.patch diff --git a/0001-net-oenetcls-introduce-oenetcls-for-network-optimiza.patch b/0001-net-oenetcls-introduce-oenetcls-for-network-optimiza.patch new file mode 100644 index 0000000..a551070 --- /dev/null +++ b/0001-net-oenetcls-introduce-oenetcls-for-network-optimiza.patch @@ -0,0 +1,2812 @@ +From 0d83dcfc81e73470f256d1a48511706376944d11 Mon Sep 17 00:00:00 2001 +From: Yue Haibing <yuehaibing@huawei.com> +Date: Tue, 5 Aug 2025 16:05:52 +0800 +Subject: [PATCH] net/oenetcls: introduce oenetcls for network optimization + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/ICBFCS +CVE: NA + +-------------------------------- + +This introduces a kind of network optimization method named oenetcls. It +can configure the ntuple rule, and bind interrupt to the netdev queue +automatically. + +Signed-off-by: Yue Haibing <yuehaibing@huawei.com> +Signed-off-by: Wang Liang <wangliang74@huawei.com> +Signed-off-by: Liu Jian <liujian56@huawei.com> +Signed-off-by: yuelg <yuelg@chinaunicom.cn> +--- + include/linux/netdevice.h | 3 + + include/linux/oenetcls.h | 80 +++ + kernel/irq/irqdesc.c | 2 +- + net/Kconfig | 1 + + net/Makefile | 1 + + net/core/dev.c | 19 + + net/ipv4/af_inet.c | 6 + + net/ipv4/tcp.c | 9 + + net/oenetcls/Kconfig | 7 + + net/oenetcls/Makefile | 7 + + net/oenetcls/asmdefs.h | 61 ++ + net/oenetcls/memcpy-sve.S | 157 +++++ + net/oenetcls/oenetcls.h | 177 ++++++ + net/oenetcls/oenetcls_flow.c | 406 ++++++++++++ + net/oenetcls/oenetcls_main.c | 1075 ++++++++++++++++++++++++++++++++ + net/oenetcls/oenetcls_ntuple.c | 573 +++++++++++++++++ + 16 files changed, 2583 insertions(+), 1 deletion(-) + create mode 100644 include/linux/oenetcls.h + create mode 100644 net/oenetcls/Kconfig + create mode 100644 net/oenetcls/Makefile + create mode 100644 net/oenetcls/asmdefs.h + create mode 100644 net/oenetcls/memcpy-sve.S + create mode 100644 net/oenetcls/oenetcls.h + create mode 100644 net/oenetcls/oenetcls_flow.c + create mode 100644 net/oenetcls/oenetcls_main.c + create mode 100644 net/oenetcls/oenetcls_ntuple.c + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index cc1f14f3c..559e59e6b 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -766,6 +766,9 @@ struct netdev_rx_queue { + struct xsk_buff_pool *pool; + #endif + struct file __rcu *dmabuf_pages; ++#if IS_ENABLED(CONFIG_OENETCLS) ++ void __rcu *oecls_ftb; ++#endif + } ____cacheline_aligned_in_smp; + + struct page * +diff --git a/include/linux/oenetcls.h b/include/linux/oenetcls.h +new file mode 100644 +index 000000000..cf57fc12e +--- /dev/null ++++ b/include/linux/oenetcls.h +@@ -0,0 +1,80 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _LINUX_OENETCLS_H ++#define _LINUX_OENETCLS_H ++ ++struct oecls_hook_ops { ++ void (*oecls_cfg_rxcls)(struct sock *sk, int is_del); ++ void (*oecls_flow_update)(struct sock *sk); ++ void (*oecls_set_cpu)(struct sk_buff *skb); ++ bool (*oecls_timeout)(struct net_device *dev, u16 rxq_index, ++ u32 flow_id, u16 filter_id); ++}; ++ ++extern const struct oecls_hook_ops __rcu *oecls_ops; ++ ++static inline void oenetcls_cfg_rxcls(struct sock *sk, int is_del) ++{ ++ const struct oecls_hook_ops *ops; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(oecls_ops); ++ if (ops && ops->oecls_cfg_rxcls) ++ 
ops->oecls_cfg_rxcls(sk, is_del); ++ rcu_read_unlock(); ++} ++ ++static inline void oenetcls_flow_update(struct sock *sk) ++{ ++ const struct oecls_hook_ops *ops; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(oecls_ops); ++ if (ops && ops->oecls_flow_update) ++ ops->oecls_flow_update(sk); ++ rcu_read_unlock(); ++} ++ ++static inline void oenetcls_skb_set_cpu(struct sk_buff *skb) ++{ ++ const struct oecls_hook_ops *ops; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(oecls_ops); ++ if (ops && ops->oecls_set_cpu) ++ ops->oecls_set_cpu(skb); ++ rcu_read_unlock(); ++} ++ ++static inline void oenetcls_skblist_set_cpu(struct list_head *head) ++{ ++ const struct oecls_hook_ops *ops; ++ struct sk_buff *skb, *next; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(oecls_ops); ++ if (ops && ops->oecls_set_cpu) { ++ list_for_each_entry_safe(skb, next, head, list) ++ ops->oecls_set_cpu(skb); ++ } ++ rcu_read_unlock(); ++} ++ ++static inline bool oenetcls_may_expire_flow(struct net_device *dev, ++ u16 rxq_index, u32 flow_id, ++ u16 filter_id, bool *expire) ++{ ++ const struct oecls_hook_ops *ops; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(oecls_ops); ++ if (ops && ops->oecls_timeout) { ++ *expire = ops->oecls_timeout(dev, rxq_index, flow_id, filter_id); ++ rcu_read_unlock(); ++ return true; ++ } ++ rcu_read_unlock(); ++ ++ return false; ++} ++ ++#endif /* _LINUX_OENETCLS_H */ +diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c +index 8202d4a99..d3db25d28 100644 +--- a/kernel/irq/irqdesc.c ++++ b/kernel/irq/irqdesc.c +@@ -366,7 +366,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) + { + return radix_tree_lookup(&irq_desc_tree, irq); + } +-#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE ++#if defined(CONFIG_KVM_BOOK3S_64_HV_MODULE) || IS_ENABLED(CONFIG_OENETCLS) + EXPORT_SYMBOL_GPL(irq_to_desc); + #endif + +diff --git a/net/Kconfig b/net/Kconfig +index dc8451e75..626d27bf7 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -72,6 +72,7 @@ source "net/xfrm/Kconfig" + source "net/iucv/Kconfig" + source "net/smc/Kconfig" + source "net/xdp/Kconfig" ++source "net/oenetcls/Kconfig" + + config INET + bool "TCP/IP networking" +diff --git a/net/Makefile b/net/Makefile +index 6a62e5b27..eade2be8e 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -78,3 +78,4 @@ obj-$(CONFIG_NET_NCSI) += ncsi/ + obj-$(CONFIG_XDP_SOCKETS) += xdp/ + obj-$(CONFIG_MPTCP) += mptcp/ + obj-$(CONFIG_MCTP) += mctp/ ++obj-$(CONFIG_OENETCLS) += oenetcls/ +diff --git a/net/core/dev.c b/net/core/dev.c +index f628494a1..8abe0dea5 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -160,6 +160,12 @@ + /* This should be increased if a protocol with a bigger head is added. 
*/ + #define GRO_MAX_HEAD (MAX_HEADER + 128) + ++#if IS_ENABLED(CONFIG_OENETCLS) ++#include <linux/oenetcls.h> ++const struct oecls_hook_ops __rcu *oecls_ops __read_mostly; ++EXPORT_SYMBOL_GPL(oecls_ops); ++#endif ++ + static DEFINE_SPINLOCK(ptype_lock); + static DEFINE_SPINLOCK(offload_lock); + struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +@@ -4770,6 +4776,10 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + bool expire = true; + unsigned int cpu; + ++#if IS_ENABLED(CONFIG_OENETCLS) ++ if (oenetcls_may_expire_flow(dev, rxq_index, flow_id, filter_id, &expire)) ++ return expire; ++#endif + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (flow_table && flow_id <= flow_table->mask) { +@@ -5881,6 +5891,9 @@ static int netif_receive_skb_internal(struct sk_buff *skb) + return ret; + } + } ++#endif ++#if IS_ENABLED(CONFIG_OENETCLS) ++ oenetcls_skb_set_cpu(skb); + #endif + ret = __netif_receive_skb(skb); + rcu_read_unlock(); +@@ -5915,6 +5928,9 @@ static void netif_receive_skb_list_internal(struct list_head *head) + } + } + } ++#endif ++#if IS_ENABLED(CONFIG_OENETCLS) ++ oenetcls_skblist_set_cpu(head); + #endif + __netif_receive_skb_list(head); + rcu_read_unlock(); +@@ -10271,6 +10287,9 @@ int __netdev_update_features(struct net_device *dev) + + return err < 0 ? 0 : 1; + } ++#if IS_ENABLED(CONFIG_OENETCLS) ++EXPORT_SYMBOL(__netdev_update_features); ++#endif + + static int netdev_do_alloc_pcpu_stats(struct net_device *dev) + { +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index 5dc1955e3..ad4937bc4 100644 +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -120,6 +120,9 @@ + #include <net/compat.h> + + #include <trace/events/sock.h> ++#if IS_ENABLED(CONFIG_OENETCLS) ++#include <linux/oenetcls.h> ++#endif + + /* The inetsw table contains everything that inet_create needs to + * build a new socket. +@@ -229,6 +232,9 @@ int inet_listen(struct socket *sock, int backlog) + if (err) + goto out; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); ++#if IS_ENABLED(CONFIG_OENETCLS) ++ oenetcls_cfg_rxcls(sk, 0); ++#endif + } + err = 0; + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index e8b7f0c5d..9309501c1 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -281,6 +281,9 @@ + #include <asm/ioctls.h> + #include <net/busy_poll.h> + #include <linux/dma-buf.h> ++#if IS_ENABLED(CONFIG_OENETCLS) ++#include <linux/oenetcls.h> ++#endif + + /* Track pending CMSGs. 
*/ + enum { +@@ -2940,6 +2943,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + ++#if IS_ENABLED(CONFIG_OENETCLS) ++ oenetcls_flow_update(sk); ++#endif + if (sk_can_busy_loop(sk) && + skb_queue_empty_lockless(&sk->sk_receive_queue) && + sk->sk_state == TCP_ESTABLISHED) +@@ -3300,6 +3306,9 @@ void __tcp_close(struct sock *sk, long timeout) + void tcp_close(struct sock *sk, long timeout) + { + lock_sock(sk); ++#if IS_ENABLED(CONFIG_OENETCLS) ++ oenetcls_cfg_rxcls(sk, 1); ++#endif + __tcp_close(sk, timeout); + release_sock(sk); + sock_put(sk); +diff --git a/net/oenetcls/Kconfig b/net/oenetcls/Kconfig +new file mode 100644 +index 000000000..1b69c744a +--- /dev/null ++++ b/net/oenetcls/Kconfig +@@ -0,0 +1,7 @@ ++# SPDX-License-Identifier: GPL-2.0-only ++config OENETCLS ++ tristate "Network classification" ++ default n ++ help ++ Allow to bind NIC interrupts and configure ntuple rules to ++ achieve sock numa affinity +diff --git a/net/oenetcls/Makefile b/net/oenetcls/Makefile +new file mode 100644 +index 000000000..f6eeed9e8 +--- /dev/null ++++ b/net/oenetcls/Makefile +@@ -0,0 +1,7 @@ ++# SPDX-License-Identifier: GPL-2.0-only ++ ++obj-$(CONFIG_OENETCLS) = oenetcls.o ++oenetcls-y := oenetcls_main.o oenetcls_ntuple.o oenetcls_flow.o ++ifeq ($(CONFIG_ARM64_SVE),y) ++oenetcls-y += memcpy-sve.o ++endif +diff --git a/net/oenetcls/asmdefs.h b/net/oenetcls/asmdefs.h +new file mode 100644 +index 000000000..8138a94c1 +--- /dev/null ++++ b/net/oenetcls/asmdefs.h +@@ -0,0 +1,61 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _ASMDEFS_H ++#define _ASMDEFS_H ++ ++/* Branch Target Identitication support. */ ++#define BTI_C hint 34 ++#define BTI_J hint 36 ++/* Return address signing support (pac-ret). */ ++#define PACIASP hint 25; .cfi_window_save ++#define AUTIASP hint 29; .cfi_window_save ++ ++/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ ++#define FEATURE_1_AND 0xc0000000 ++#define FEATURE_1_BTI 1 ++#define FEATURE_1_PAC 2 ++ ++/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ ++#define GNU_PROPERTY(type, value) \ ++ .section .note.gnu.property, "a"; \ ++ .p2align 3; \ ++ .word 4; \ ++ .word 16; \ ++ .word 5; \ ++ .asciz "GNU"; \ ++ .word type; \ ++ .word 4; \ ++ .word value; \ ++ .word 0; \ ++ .text ++ ++#ifndef WANT_GNU_PROPERTY ++#define WANT_GNU_PROPERTY 1 ++#endif ++ ++#if WANT_GNU_PROPERTY ++/* Add property note with supported features to all asm files. 
*/ ++GNU_PROPERTY(FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) ++#endif ++ ++#define ENTRY_ALIGN(name, alignment) \ ++ .global name; \ ++ .type name, %function; \ ++ .align alignment; \ ++name: \ ++ .cfi_startproc; \ ++ BTI_C; ++ ++#define ENTRY(name) ENTRY_ALIGN(name, 6) ++ ++#define ENTRY_ALIAS(name) \ ++ .global name; \ ++ .type name, %function; \ ++ name: ++ ++#define END(name) \ ++ .cfi_endproc; \ ++ .size name, .-name; ++ ++#define L(l) .L ## l ++ ++#endif +diff --git a/net/oenetcls/memcpy-sve.S b/net/oenetcls/memcpy-sve.S +new file mode 100644 +index 000000000..106e4c302 +--- /dev/null ++++ b/net/oenetcls/memcpy-sve.S +@@ -0,0 +1,157 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#include "asmdefs.h" ++ ++.arch armv8-a+sve ++ ++#define dstin x0 ++#define src x1 ++#define count x2 ++#define dst x3 ++#define srcend x4 ++#define dstend x5 ++#define tmp1 x6 ++#define vlen x6 ++ ++#define A_q q0 ++#define B_q q1 ++#define C_q q2 ++#define D_q q3 ++#define E_q q4 ++#define F_q q5 ++#define G_q q6 ++#define H_q q7 ++ ++/* This implementation handles overlaps and supports both memcpy and memmove ++ from a single entry point. It uses unaligned accesses and branchless ++ sequences to keep the code small, simple and improve performance. ++ SVE vectors are used to speedup small copies. ++ ++ Copies are split into 3 main cases: small copies of up to 32 bytes, medium ++ copies of up to 128 bytes, and large copies. The overhead of the overlap ++ check is negligible since it is only required for large copies. ++ ++ Large copies use a software pipelined loop processing 64 bytes per iteration. ++ The source pointer is 16-byte aligned to minimize unaligned accesses. ++ The loop tail is handled by always copying 64 bytes from the end. ++*/ ++ ++ENTRY_ALIAS (__memmove_aarch64_sve) ++ENTRY (__memcpy_aarch64_sve) ++ cmp count, 128 ++ b.hi L(copy_long) ++ cntb vlen ++ cmp count, vlen, lsl 1 ++ b.hi L(copy32_128) ++ ++ whilelo p0.b, xzr, count ++ whilelo p1.b, vlen, count ++ ld1b z0.b, p0/z, [src, 0, mul vl] ++ ld1b z1.b, p1/z, [src, 1, mul vl] ++ st1b z0.b, p0, [dstin, 0, mul vl] ++ st1b z1.b, p1, [dstin, 1, mul vl] ++ ret ++ ++ /* Medium copies: 33..128 bytes. */ ++L(copy32_128): ++ add srcend, src, count ++ add dstend, dstin, count ++ ldp A_q, B_q, [src] ++ ldp C_q, D_q, [srcend, -32] ++ cmp count, 64 ++ b.hi L(copy128) ++ stp A_q, B_q, [dstin] ++ stp C_q, D_q, [dstend, -32] ++ ret ++ ++ /* Copy 65..128 bytes. */ ++L(copy128): ++ ldp E_q, F_q, [src, 32] ++ cmp count, 96 ++ b.ls L(copy96) ++ ldp G_q, H_q, [srcend, -64] ++ stp G_q, H_q, [dstend, -64] ++L(copy96): ++ stp A_q, B_q, [dstin] ++ stp E_q, F_q, [dstin, 32] ++ stp C_q, D_q, [dstend, -32] ++ ret ++ ++ /* Copy more than 128 bytes. */ ++L(copy_long): ++ add srcend, src, count ++ add dstend, dstin, count ++ ++ /* Use backwards copy if there is an overlap. */ ++ sub tmp1, dstin, src ++ cmp tmp1, count ++ b.lo L(copy_long_backwards) ++ ++ /* Copy 16 bytes and then align src to 16-byte alignment. */ ++ ldr D_q, [src] ++ and tmp1, src, 15 ++ bic src, src, 15 ++ sub dst, dstin, tmp1 ++ add count, count, tmp1 /* Count is now 16 too large. */ ++ ldp A_q, B_q, [src, 16] ++ str D_q, [dstin] ++ ldp C_q, D_q, [src, 48] ++ subs count, count, 128 + 16 /* Test and readjust count. 
*/ ++ b.ls L(copy64_from_end) ++L(loop64): ++ stp A_q, B_q, [dst, 16] ++ ldp A_q, B_q, [src, 80] ++ stp C_q, D_q, [dst, 48] ++ ldp C_q, D_q, [src, 112] ++ add src, src, 64 ++ add dst, dst, 64 ++ subs count, count, 64 ++ b.hi L(loop64) ++ ++ /* Write the last iteration and copy 64 bytes from the end. */ ++L(copy64_from_end): ++ ldp E_q, F_q, [srcend, -64] ++ stp A_q, B_q, [dst, 16] ++ ldp A_q, B_q, [srcend, -32] ++ stp C_q, D_q, [dst, 48] ++ stp E_q, F_q, [dstend, -64] ++ stp A_q, B_q, [dstend, -32] ++ ret ++ ++ /* Large backwards copy for overlapping copies. ++ Copy 16 bytes and then align srcend to 16-byte alignment. */ ++L(copy_long_backwards): ++ cbz tmp1, L(return) ++ ldr D_q, [srcend, -16] ++ and tmp1, srcend, 15 ++ bic srcend, srcend, 15 ++ sub count, count, tmp1 ++ ldp A_q, B_q, [srcend, -32] ++ str D_q, [dstend, -16] ++ ldp C_q, D_q, [srcend, -64] ++ sub dstend, dstend, tmp1 ++ subs count, count, 128 ++ b.ls L(copy64_from_start) ++ ++L(loop64_backwards): ++ str B_q, [dstend, -16] ++ str A_q, [dstend, -32] ++ ldp A_q, B_q, [srcend, -96] ++ str D_q, [dstend, -48] ++ str C_q, [dstend, -64]! ++ ldp C_q, D_q, [srcend, -128] ++ sub srcend, srcend, 64 ++ subs count, count, 64 ++ b.hi L(loop64_backwards) ++ ++ /* Write the last iteration and copy 64 bytes from the start. */ ++L(copy64_from_start): ++ ldp E_q, F_q, [src, 32] ++ stp A_q, B_q, [dstend, -32] ++ ldp A_q, B_q, [src] ++ stp C_q, D_q, [dstend, -64] ++ stp E_q, F_q, [dstin, 32] ++ stp A_q, B_q, [dstin] ++L(return): ++ ret ++ ++END (__memcpy_aarch64_sve) +diff --git a/net/oenetcls/oenetcls.h b/net/oenetcls/oenetcls.h +new file mode 100644 +index 000000000..215ae3e7e +--- /dev/null ++++ b/net/oenetcls/oenetcls.h +@@ -0,0 +1,177 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _NET_OENETCLS_H ++#define _NET_OENETCLS_H ++#include <linux/if.h> ++#include <linux/mutex.h> ++#include <linux/cpufeature.h> ++ ++#define OECLS_MAX_NETDEV_NUM 8 ++#define OECLS_MAX_RXQ_NUM_PER_DEV 256 ++#define OECLS_MAX_CPU_NUM 1024 ++ ++#define OECLS_TIMEOUT (5 * HZ) ++#define OECLS_NO_FILTER 0xffff ++#define OECLS_NO_CPU 0xffff ++ ++struct oecls_netdev_queue_info { ++ int irq; ++ int affinity_cpu; ++}; ++ ++struct oecls_netdev_info { ++ char dev_name[IFNAMSIZ]; ++ struct net_device *netdev; ++ int rxq_num; ++ struct oecls_netdev_queue_info rxq[OECLS_MAX_RXQ_NUM_PER_DEV]; ++ int old_filter_state; ++}; ++ ++struct oecls_rxq { ++ int rxq_id; ++ int status; ++}; ++ ++struct oecls_numa_clusterinfo { ++ int cluster_id; ++ int cur_freeidx; ++ struct oecls_rxq rxqs[OECLS_MAX_RXQ_NUM_PER_DEV]; ++}; ++ ++struct oecls_numa_bound_dev_info { ++ DECLARE_BITMAP(bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); ++ struct oecls_numa_clusterinfo *cluster_info; ++}; ++ ++struct oecls_numa_info { ++ DECLARE_BITMAP(avail_cpus, OECLS_MAX_CPU_NUM); ++ struct oecls_numa_bound_dev_info bound_dev[OECLS_MAX_NETDEV_NUM]; ++}; ++ ++struct cmd_context { ++ char netdev[IFNAMSIZ]; ++ u32 dip4; ++ u16 dport; ++ u16 action; ++ u32 ruleid; ++ u32 del_ruleid; ++ int ret_loc; ++}; ++ ++#define OECLS_SK_RULE_HASHSIZE 256 ++#define OECLS_SK_RULE_HASHMASK (OECLS_SK_RULE_HASHSIZE - 1) ++ ++struct oecls_sk_rule_list { ++ struct hlist_head hash[OECLS_SK_RULE_HASHSIZE]; ++ /* Mutex to synchronize access to ntuple rule locking */ ++ struct mutex mutex; ++}; ++ ++struct oecls_sk_rule { ++ struct hlist_node node; ++ int devid; ++ void *sk; ++ int dip4; ++ int dport; ++ int action; ++ int ruleid; ++ int nid; ++}; ++ ++struct oecls_sk_entry { ++ struct hlist_node node; ++ void *sk; ++ u32 sk_rule_hash; ++}; 
++ ++struct oecls_dev_flow { ++ unsigned short cpu; ++ unsigned short filter; ++ unsigned int last_qtail; ++ int isvalid; ++ unsigned long timeout; ++}; ++ ++struct oecls_dev_flow_table { ++ unsigned int mask; ++ struct rcu_head rcu; ++ struct oecls_dev_flow flows[]; ++}; ++ ++struct oecls_sock_flow_table { ++ u32 mask; ++ u32 ents[] ____cacheline_aligned_in_smp; ++}; ++ ++#define OECLS_DEV_FLOW_TABLE_NUM 0x1000 ++#define OECLS_SOCK_FLOW_TABLE_NUM 0x100000 ++#define OECLS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct oecls_dev_flow_table) + \ ++ ((_num) * sizeof(struct oecls_dev_flow))) ++#define OECLS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct oecls_sock_flow_table, ents[_num])) ++ ++#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ ++ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) ++#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ ++ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ ++ NETIF_F_RXHASH) ++ ++struct rmgr_ctrl { ++ int driver_select; ++ unsigned long *slot; ++ __u32 n_rules; ++ __u32 size; ++}; ++ ++extern int match_ip_flag; ++extern int debug; ++extern int oecls_netdev_num; ++extern int oecls_numa_num; ++ ++#define oecls_debug(fmt, ...) \ ++ do { \ ++ if (debug) \ ++ trace_printk(fmt, ## __VA_ARGS__); \ ++ } while (0) ++ ++#define oecls_error(fmt, ...) \ ++ do { \ ++ pr_err("oenetcls [%s:%d]: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); \ ++ trace_printk(fmt, ## __VA_ARGS__); \ ++ } while (0) ++ ++struct oecls_netdev_info *get_oecls_netdev_info(unsigned int index); ++ ++#define for_each_oecls_netdev(devid, oecls_dev) \ ++ for (devid = 0, oecls_dev = get_oecls_netdev_info(devid); \ ++ (devid < oecls_netdev_num) && oecls_dev; \ ++ devid++, oecls_dev = get_oecls_netdev_info(devid)) ++ ++struct oecls_numa_info *get_oecls_numa_info(unsigned int nid); ++ ++#define for_each_oecls_numa(nid, numa_info) \ ++ for (nid = 0, numa_info = get_oecls_numa_info(nid); \ ++ (nid < oecls_numa_num) && numa_info; \ ++ nid++, numa_info = get_oecls_numa_info(nid)) ++ ++#ifdef CONFIG_ARM64_SVE ++void *__memcpy_aarch64_sve(void *, const void *, size_t); ++#define memcpy_r(dst, src, len) \ ++ do { \ ++ if (system_supports_sve()) \ ++ __memcpy_aarch64_sve(dst, src, len); \ ++ else \ ++ memcpy(dst, src, len); \ ++ } while (0) ++#else ++#define memcpy_r(dst, src, len) memcpy(dst, src, len) ++#endif ++ ++int check_appname(char *task_name); ++int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd); ++int alloc_rxq_id(int nid, int devid); ++void free_rxq_id(int nid, int devid, int rxq_id); ++void oecls_ntuple_res_init(void); ++void oecls_ntuple_res_clean(void); ++void oecls_flow_res_init(void); ++void oecls_flow_res_clean(void); ++ ++#endif /* _NET_OENETCLS_H */ +diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c +new file mode 100644 +index 000000000..15ee13405 +--- /dev/null ++++ b/net/oenetcls/oenetcls_flow.c +@@ -0,0 +1,406 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include <linux/inetdevice.h> ++#include <linux/netdevice.h> ++#include <linux/rtnetlink.h> ++#include <linux/irq.h> ++#include <linux/irqdesc.h> ++#include <linux/inet.h> ++#include <linux/oenetcls.h> ++#include <net/sock.h> ++ ++#include "oenetcls.h" ++ ++static u32 oecls_cpu_mask; ++static struct oecls_sock_flow_table __rcu *oecls_sock_flow_table; ++static DEFINE_MUTEX(oecls_sock_flow_mutex); ++static DEFINE_SPINLOCK(oecls_dev_flow_lock); ++ ++bool is_oecls_config_netdev(const char *name) ++{ ++ struct oecls_netdev_info *netdev_info; ++ int netdev_loop; ++ ++ 
for_each_oecls_netdev(netdev_loop, netdev_info) ++ if (strcmp(netdev_info->dev_name, name) == 0) ++ return true; ++ ++ return false; ++} ++ ++static bool _oecls_timeout(struct net_device *dev, u16 rxq_index, ++ u32 flow_id, u16 filter_id) ++{ ++ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; ++ struct oecls_dev_flow_table *flow_table; ++ struct oecls_dev_flow *rflow; ++ bool expire = true; ++ unsigned int cpu; ++ ++ rcu_read_lock(); ++ flow_table = rcu_dereference(rxqueue->oecls_ftb); ++ if (flow_table && flow_id <= flow_table->mask) { ++ rflow = &flow_table->flows[flow_id]; ++ cpu = READ_ONCE(rflow->cpu); ++ oecls_debug("dev:%s, rxq:%d, flow_id:%u, filter_id:%d/%d, cpu:%d\n", dev->name, ++ rxq_index, flow_id, filter_id, rflow->filter, cpu); ++ ++ if (rflow->filter == filter_id && cpu < nr_cpu_ids) { ++ if (time_before(jiffies, rflow->timeout + OECLS_TIMEOUT)) { ++ expire = false; ++ } else { ++ rflow->isvalid = 0; ++ WRITE_ONCE(rflow->cpu, OECLS_NO_CPU); ++ } ++ } ++ } ++ rcu_read_unlock(); ++ oecls_debug("%s, dev:%s, rxq:%d, flow_id:%u, filter_id:%d, expire:%d\n", __func__, ++ dev->name, rxq_index, flow_id, filter_id, expire); ++ return expire; ++} ++ ++static void _oecls_flow_update(struct sock *sk) ++{ ++ struct oecls_sock_flow_table *tb; ++ unsigned int hash, index; ++ u32 val; ++ u32 cpu = raw_smp_processor_id(); ++ ++ if (sk->sk_state != TCP_ESTABLISHED) ++ return; ++ ++ if (check_appname(current->comm)) ++ return; ++ ++ rcu_read_lock(); ++ tb = rcu_dereference(oecls_sock_flow_table); ++ hash = READ_ONCE(sk->sk_rxhash); ++ if (tb && hash) { ++ index = hash & tb->mask; ++ val = hash & ~oecls_cpu_mask; ++ val |= cpu; ++ ++ if (READ_ONCE(tb->ents[index]) != val) { ++ WRITE_ONCE(tb->ents[index], val); ++ ++ oecls_debug("[%s] sk:%p, hash:0x%x, index:0x%x, val:0x%x, cpu:%d\n", ++ current->comm, sk, hash, index, val, cpu); ++ } ++ } ++ rcu_read_unlock(); ++} ++ ++static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb) ++{ ++ struct oecls_netdev_info *netdev_info; ++ int netdev_loop; ++ u32 hash, index; ++ struct oecls_numa_info *numa_info; ++ struct oecls_numa_bound_dev_info *bound_dev = NULL; ++ int rxq_id, rxq_num, i; ++ ++ numa_info = get_oecls_numa_info(nid); ++ if (!numa_info) ++ return -1; ++ ++ for_each_oecls_netdev(netdev_loop, netdev_info) { ++ if (strcmp(netdev_info->dev_name, dev->name) == 0) { ++ bound_dev = &numa_info->bound_dev[netdev_loop]; ++ break; ++ } ++ } ++ ++ if (!bound_dev) ++ return -1; ++ rxq_num = bitmap_weight(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); ++ if (rxq_num == 0) ++ return -1; ++ ++ hash = skb_get_hash(skb); ++ index = hash % rxq_num; ++ ++ i = 0; ++ for_each_set_bit(rxq_id, bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV) ++ if (index == i++) ++ return rxq_id; ++ ++ return -1; ++} ++ ++static void set_oecls_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct oecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu) ++{ ++ struct netdev_rx_queue *rxqueue; ++ struct oecls_dev_flow_table *dtb; ++ struct oecls_dev_flow *rflow; ++ u32 flow_id, hash; ++ u16 rxq_index; ++ int rc; ++ ++ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || ++ !(dev->features & NETIF_F_NTUPLE)) ++ return; ++ ++ rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb); ++ if (rxq_index == skb_get_rx_queue(skb) || rxq_index < 0) ++ return; ++ ++ rxqueue = dev->_rx + rxq_index; ++ dtb = rcu_dereference(rxqueue->oecls_ftb); ++ if (!dtb) ++ return; ++ ++ hash = skb_get_hash(skb); ++ flow_id = hash & dtb->mask; ++ rflow = 
&dtb->flows[flow_id]; ++ if (rflow->isvalid && rflow->cpu == next_cpu) { ++ rflow->timeout = jiffies; ++ return; ++ } ++ ++ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); ++ oecls_debug("skb:%p, rxq:%d, hash:0x%x, flow_id:%u, old_rxq_id:%d, next_cpu:%d, rc:%d\n", ++ skb, rxq_index, hash, flow_id, old_rxq_id, next_cpu, rc); ++ if (rc < 0) ++ return; ++ ++ rflow->filter = rc; ++ rflow->isvalid = 1; ++ rflow->timeout = jiffies; ++ if (old_rflow->filter == rflow->filter) ++ old_rflow->filter = OECLS_NO_FILTER; ++ rflow->cpu = next_cpu; ++} ++ ++static void __oecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, ++ struct oecls_sock_flow_table *tb, struct oecls_dev_flow_table *dtb, ++ int old_rxq_id) ++{ ++ struct oecls_dev_flow *rflow; ++ u32 last_recv_cpu, hash, val; ++ u32 tcpu = 0; ++ u32 cpu = raw_smp_processor_id(); ++ ++ skb_reset_network_header(skb); ++ hash = skb_get_hash(skb); ++ if (!hash) ++ return; ++ ++ val = READ_ONCE(tb->ents[hash & tb->mask]); ++ last_recv_cpu = val & oecls_cpu_mask; ++ rflow = &dtb->flows[hash & dtb->mask]; ++ tcpu = rflow->cpu; ++ ++ if ((val ^ hash) & ~oecls_cpu_mask) ++ return; ++ ++ if (cpu_to_node(cpu) == cpu_to_node(last_recv_cpu)) ++ return; ++ ++ if (tcpu >= nr_cpu_ids) ++ set_oecls_cpu(ndev, skb, rflow, old_rxq_id, last_recv_cpu); ++} ++ ++static void _oecls_set_cpu(struct sk_buff *skb) ++{ ++ struct net_device *ndev = skb->dev; ++ struct oecls_sock_flow_table *stb; ++ struct oecls_dev_flow_table *dtb; ++ struct netdev_rx_queue *rxqueue; ++ int rxq_id = -1; ++ ++ if (!ndev) ++ return; ++ ++ if (!is_oecls_config_netdev(ndev->name)) ++ return; ++ ++ rxqueue = ndev->_rx; ++ if (skb_rx_queue_recorded(skb)) { ++ rxq_id = skb_get_rx_queue(skb); ++ if (rxq_id >= ndev->real_num_rx_queues) { ++ oecls_debug("ndev:%s, rxq:%d, real_num:%d\n", ndev->name, ++ rxq_id, ndev->real_num_rx_queues); ++ return; ++ } ++ rxqueue += rxq_id; ++ } ++ ++ // oecls_debug("skb:%px, dev:%s, rxq_id:%d\n", skb, ndev->name, rxq_id); ++ if (rxq_id < 0) ++ return; ++ ++ rcu_read_lock(); ++ stb = rcu_dereference(oecls_sock_flow_table); ++ dtb = rcu_dereference(rxqueue->oecls_ftb); ++ if (stb && dtb) ++ __oecls_set_cpu(skb, ndev, stb, dtb, rxq_id); ++ ++ rcu_read_unlock(); ++} ++ ++static void oecls_dev_flow_table_free(struct rcu_head *rcu) ++{ ++ struct oecls_dev_flow_table *table = container_of(rcu, ++ struct oecls_dev_flow_table, rcu); ++ vfree(table); ++} ++ ++static void oecls_dev_flow_table_cleanup(struct net_device *netdev, int qid) ++{ ++ struct oecls_dev_flow_table *dtb; ++ struct netdev_rx_queue *queue; ++ int i; ++ ++ spin_lock(&oecls_dev_flow_lock); ++ for (i = 0; i < qid; i++) { ++ queue = netdev->_rx + i; ++ dtb = rcu_dereference_protected(queue->oecls_ftb, ++ lockdep_is_held(&oecls_dev_flow_lock)); ++ rcu_assign_pointer(queue->oecls_ftb, NULL); ++ } ++ spin_unlock(&oecls_dev_flow_lock); ++ call_rcu(&dtb->rcu, oecls_dev_flow_table_free); ++} ++ ++static int oecls_dev_flow_table_release(void) ++{ ++ struct oecls_netdev_info *netdev_info; ++ int netdev_loop; ++ struct net_device *netdev; ++ ++ for_each_oecls_netdev(netdev_loop, netdev_info) { ++ netdev = netdev_info->netdev; ++ if (!netdev) ++ continue; ++ oecls_dev_flow_table_cleanup(netdev, netdev->num_rx_queues); ++ } ++ ++ return 0; ++} ++ ++static int _oecls_dev_flow_table_init(struct net_device *netdev) ++{ ++ struct oecls_dev_flow_table *table; ++ int size = OECLS_DEV_FLOW_TABLE_NUM; ++ struct netdev_rx_queue *queue; ++ int i, j, ret = 0; ++ ++ size = roundup_pow_of_two(size); ++ 
oecls_debug("dev:%s, num_rx_queues:%d, mask:0x%x\n", netdev->name, netdev->num_rx_queues, ++ size - 1); ++ ++ for (i = 0; i < netdev->num_rx_queues; i++) { ++ table = vmalloc(OECLS_DEV_FLOW_TABLE_SIZE(size)); ++ if (!table) { ++ ret = -ENOMEM; ++ goto fail; ++ } ++ ++ table->mask = size - 1; ++ for (j = 0; j < size; j++) { ++ table->flows[j].cpu = OECLS_NO_CPU; ++ table->flows[j].isvalid = 0; ++ } ++ ++ queue = netdev->_rx + i; ++ ++ spin_lock(&oecls_dev_flow_lock); ++ rcu_assign_pointer(queue->oecls_ftb, table); ++ spin_unlock(&oecls_dev_flow_lock); ++ } ++ return ret; ++fail: ++ oecls_dev_flow_table_cleanup(netdev, i); ++ return ret; ++} ++ ++static int oecls_dev_flow_table_init(void) ++{ ++ struct oecls_netdev_info *netdev_info; ++ int netdev_loop; ++ struct net_device *ndev; ++ int i, err; ++ ++ for_each_oecls_netdev(netdev_loop, netdev_info) { ++ ndev = netdev_info->netdev; ++ if (!ndev) ++ continue; ++ err = _oecls_dev_flow_table_init(ndev); ++ if (err) ++ goto out; ++ } ++ ++ return 0; ++out: ++ for (i = 0; i < netdev_loop; i++) { ++ netdev_info = get_oecls_netdev_info(i); ++ ndev = netdev_info->netdev; ++ if (!ndev) ++ continue; ++ oecls_dev_flow_table_cleanup(ndev, ndev->num_rx_queues); ++ } ++ return err; ++} ++ ++static const struct oecls_hook_ops oecls_flow_ops = { ++ .oecls_flow_update = _oecls_flow_update, ++ .oecls_set_cpu = _oecls_set_cpu, ++ .oecls_timeout = _oecls_timeout, ++ .oecls_cfg_rxcls = NULL, ++}; ++ ++static int oecls_sock_flow_table_release(void) ++{ ++ struct oecls_sock_flow_table *tb; ++ ++ mutex_lock(&oecls_sock_flow_mutex); ++ tb = rcu_dereference_protected(oecls_sock_flow_table, ++ lockdep_is_held(&oecls_sock_flow_mutex)); ++ if (tb) ++ rcu_assign_pointer(oecls_sock_flow_table, NULL); ++ mutex_unlock(&oecls_sock_flow_mutex); ++ synchronize_rcu(); ++ vfree(tb); ++ ++ return 0; ++} ++ ++static int oecls_sock_flow_table_init(void) ++{ ++ struct oecls_sock_flow_table *table; ++ int size = OECLS_SOCK_FLOW_TABLE_NUM; ++ int i; ++ ++ size = roundup_pow_of_two(size); ++ table = vmalloc(OECLS_SOCK_FLOW_TABLE_SIZE(size)); ++ if (!table) ++ return -ENOMEM; ++ ++ oecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; ++ oecls_debug("nr_cpu_ids:%d, oecls_cpu_mask:0x%x\n", nr_cpu_ids, oecls_cpu_mask); ++ ++ table->mask = size - 1; ++ for (i = 0; i < size; i++) ++ table->ents[i] = OECLS_NO_CPU; ++ ++ mutex_lock(&oecls_sock_flow_mutex); ++ rcu_assign_pointer(oecls_sock_flow_table, table); ++ mutex_unlock(&oecls_sock_flow_mutex); ++ ++ return 0; ++} ++ ++void oecls_flow_res_init(void) ++{ ++ oecls_sock_flow_table_init(); ++ oecls_dev_flow_table_init(); ++ RCU_INIT_POINTER(oecls_ops, &oecls_flow_ops); ++} ++ ++void oecls_flow_res_clean(void) ++{ ++ RCU_INIT_POINTER(oecls_ops, NULL); ++ oecls_sock_flow_table_release(); ++ oecls_dev_flow_table_release(); ++} +diff --git a/net/oenetcls/oenetcls_main.c b/net/oenetcls/oenetcls_main.c +new file mode 100644 +index 000000000..b69d10036 +--- /dev/null ++++ b/net/oenetcls/oenetcls_main.c +@@ -0,0 +1,1075 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include <linux/module.h> ++#include <linux/netdevice.h> ++#include <linux/netdev_features.h> ++#include <linux/ethtool.h> ++#include <linux/irq.h> ++#include <linux/irqdesc.h> ++#include <linux/rtnetlink.h> ++#include "oenetcls.h" ++ ++int oecls_netdev_num; ++static struct oecls_netdev_info oecls_netdev_info_table[OECLS_MAX_NETDEV_NUM]; ++ ++int oecls_numa_num; ++static int oecls_cluster_cpu_num, oecls_cluster_per_numa; ++static struct oecls_numa_info *oecls_numa_info_table; ++ ++int 
debug; ++module_param(debug, int, 0644); ++MODULE_PARM_DESC(debug, "debug switch"); ++ ++static int mode; ++module_param(mode, int, 0444); ++MODULE_PARM_DESC(mode, "mode, default 0"); ++ ++static char ifname[64] = { 0 }; ++module_param_string(ifname, ifname, sizeof(ifname), 0444); ++MODULE_PARM_DESC(ifname, "ifname"); ++ ++static char appname[64] = "redis-server"; ++module_param_string(appname, appname, sizeof(appname), 0644); ++MODULE_PARM_DESC(appname, "appname, default redis-server"); ++ ++int match_ip_flag = 1; ++module_param(match_ip_flag, int, 0644); ++MODULE_PARM_DESC(match_ip_flag, "match ip flag"); ++ ++static int strategy; ++module_param(strategy, int, 0444); ++MODULE_PARM_DESC(strategy, "strategy, default 0"); ++ ++static bool check_params(void) ++{ ++ if (mode != 0 && mode != 1) ++ return false; ++ ++ if (strlen(ifname) == 0) ++ return false; ++ ++ return true; ++} ++ ++int check_appname(char *task_name) ++{ ++ char *start = appname, *end; ++ ++ if (!strlen(appname)) ++ return 0; ++ ++ // support appname: app1#app2#appN ++ while (*start != '\0') { ++ end = strchr(start, '#'); ++ if (end == start) { ++ start++; ++ continue; ++ } ++ ++ if (!end) { ++ if (!strncmp(task_name, start, strlen(start))) ++ return 0; ++ break; ++ } ++ ++ if (!strncmp(task_name, start, end - start)) ++ return 0; ++ start = end + 1; ++ } ++ return -EOPNOTSUPP; ++} ++ ++static u32 __ethtool_get_flags(struct net_device *dev) ++{ ++ u32 flags = 0; ++ ++ if (dev->features & NETIF_F_LRO) ++ flags |= ETH_FLAG_LRO; ++ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) ++ flags |= ETH_FLAG_RXVLAN; ++ if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) ++ flags |= ETH_FLAG_TXVLAN; ++ if (dev->features & NETIF_F_NTUPLE) ++ flags |= ETH_FLAG_NTUPLE; ++ if (dev->features & NETIF_F_RXHASH) ++ flags |= ETH_FLAG_RXHASH; ++ ++ return flags; ++} ++ ++static int __ethtool_set_flags(struct net_device *dev, u32 data) ++{ ++ netdev_features_t features = 0, changed; ++ ++ if (data & ~ETH_ALL_FLAGS) ++ return -EINVAL; ++ ++ if (data & ETH_FLAG_LRO) ++ features |= NETIF_F_LRO; ++ if (data & ETH_FLAG_RXVLAN) ++ features |= NETIF_F_HW_VLAN_CTAG_RX; ++ if (data & ETH_FLAG_TXVLAN) ++ features |= NETIF_F_HW_VLAN_CTAG_TX; ++ if (data & ETH_FLAG_NTUPLE) ++ features |= NETIF_F_NTUPLE; ++ if (data & ETH_FLAG_RXHASH) ++ features |= NETIF_F_RXHASH; ++ ++ /* allow changing only bits set in hw_features */ ++ changed = (features ^ dev->features) & ETH_ALL_FEATURES; ++ if (changed & ~dev->hw_features) ++ return (changed & dev->hw_features) ? 
-EINVAL : -EOPNOTSUPP; ++ ++ dev->wanted_features = ++ (dev->wanted_features & ~changed) | (features & changed); ++ ++ __netdev_update_features(dev); ++ ++ return 0; ++} ++ ++static void ethtool_rxnfc_copy_to_user(void *useraddr, ++ const struct ethtool_rxnfc *rxnfc, ++ size_t size, const u32 *rule_buf) ++{ ++ memcpy_r(useraddr, rxnfc, size); ++ useraddr += offsetof(struct ethtool_rxnfc, rule_locs); ++ ++ if (rule_buf) ++ memcpy_r(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32)); ++} ++ ++static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, ++ u32 cmd, void *useraddr) ++{ ++ struct ethtool_rxnfc info; ++ size_t info_size = sizeof(info); ++ int rc; ++ ++ if (!dev->ethtool_ops->set_rxnfc) ++ return -EOPNOTSUPP; ++ ++ if (cmd == ETHTOOL_SRXFH) ++ info_size = (offsetof(struct ethtool_rxnfc, data) + ++ sizeof(info.data)); ++ ++ memcpy_r(&info, useraddr, info_size); ++ rc = dev->ethtool_ops->set_rxnfc(dev, &info); ++ if (rc) ++ return rc; ++ ++ if (cmd == ETHTOOL_SRXCLSRLINS) ++ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); ++ ++ return 0; ++} ++ ++static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, ++ u32 cmd, void *useraddr) ++{ ++ struct ethtool_rxnfc info; ++ size_t info_size = sizeof(info); ++ const struct ethtool_ops *ops = dev->ethtool_ops; ++ int ret; ++ void *rule_buf = NULL; ++ ++ if (!ops->get_rxnfc) ++ return -EOPNOTSUPP; ++ ++ if (cmd == ETHTOOL_GRXFH) ++ info_size = (offsetof(struct ethtool_rxnfc, data) + ++ sizeof(info.data)); ++ ++ memcpy_r(&info, useraddr, info_size); ++ ++ /* If FLOW_RSS was requested then user-space must be using the ++ * new definition, as FLOW_RSS is newer. ++ */ ++ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { ++ info_size = sizeof(info); ++ memcpy_r(&info, useraddr, info_size); ++ /* Since malicious users may modify the original data, ++ * we need to check whether FLOW_RSS is still requested. 
++ */ ++ if (!(info.flow_type & FLOW_RSS)) ++ return -EINVAL; ++ } ++ ++ if (info.cmd != cmd) ++ return -EINVAL; ++ ++ if (info.cmd == ETHTOOL_GRXCLSRLALL) { ++ if (info.rule_cnt > 0) { ++ if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) ++ rule_buf = kcalloc(info.rule_cnt, sizeof(u32), ++ GFP_KERNEL); ++ if (!rule_buf) ++ return -ENOMEM; ++ } ++ } ++ ++ ret = ops->get_rxnfc(dev, &info, rule_buf); ++ if (ret < 0) ++ goto err_out; ++ ++ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); ++err_out: ++ kfree(rule_buf); ++ ++ return ret; ++} ++ ++static noinline_for_stack int ethtool_get_channels(struct net_device *dev, ++ void *useraddr) ++{ ++ struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; ++ ++ if (!dev->ethtool_ops->get_channels) ++ return -EOPNOTSUPP; ++ ++ dev->ethtool_ops->get_channels(dev, &channels); ++ ++ memcpy_r(useraddr, &channels, sizeof(channels)); ++ return 0; ++} ++ ++static int ethtool_get_value(struct net_device *dev, char *useraddr, ++ u32 cmd, u32 (*actor)(struct net_device *)) ++{ ++ struct ethtool_value edata = { .cmd = cmd }; ++ ++ if (!actor) ++ return -EOPNOTSUPP; ++ ++ edata.data = actor(dev); ++ ++ memcpy_r(useraddr, &edata, sizeof(edata)); ++ return 0; ++} ++ ++static int ethtool_set_value(struct net_device *dev, char *useraddr, ++ int (*actor)(struct net_device *, u32)) ++{ ++ struct ethtool_value edata; ++ ++ if (!actor) ++ return -EOPNOTSUPP; ++ ++ memcpy_r(&edata, useraddr, sizeof(edata)); ++ ++ return actor(dev, edata.data); ++} ++ ++static int dev_ethtool_kern(struct net *net, struct ifreq *ifr) ++{ ++ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); ++ void *useraddr = ifr->ifr_data; ++ u32 ethcmd, sub_cmd; ++ int rc; ++ netdev_features_t old_features; ++ ++ if (!dev || !netif_device_present(dev)) ++ return -ENODEV; ++ ++ memcpy_r(ðcmd, useraddr, sizeof(ethcmd)); ++ ++ if (ethcmd == ETHTOOL_PERQUEUE) ++ memcpy_r(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)); ++ else ++ sub_cmd = ethcmd; ++ ++ /* Allow some commands to be done by anyone */ ++ switch (sub_cmd) { ++ case ETHTOOL_GFLAGS: ++ case ETHTOOL_GRXFH: ++ case ETHTOOL_GRXRINGS: ++ case ETHTOOL_GRXCLSRLCNT: ++ case ETHTOOL_GRXCLSRULE: ++ case ETHTOOL_GRXCLSRLALL: ++ case ETHTOOL_GCHANNELS: ++ break; ++ default: ++ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) ++ return -EPERM; ++ } ++ ++ if (dev->ethtool_ops->begin) { ++ rc = dev->ethtool_ops->begin(dev); ++ if (rc < 0) ++ return rc; ++ } ++ old_features = dev->features; ++ ++ switch (ethcmd) { ++ case ETHTOOL_GFLAGS: ++ rc = ethtool_get_value(dev, useraddr, ethcmd, ++ __ethtool_get_flags); ++ break; ++ case ETHTOOL_SFLAGS: ++ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); ++ break; ++ case ETHTOOL_GRXFH: ++ case ETHTOOL_GRXRINGS: ++ case ETHTOOL_GRXCLSRLCNT: ++ case ETHTOOL_GRXCLSRULE: ++ case ETHTOOL_GRXCLSRLALL: ++ rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); ++ break; ++ case ETHTOOL_SRXFH: ++ case ETHTOOL_SRXCLSRLDEL: ++ case ETHTOOL_SRXCLSRLINS: ++ rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); ++ break; ++ case ETHTOOL_GCHANNELS: ++ rc = ethtool_get_channels(dev, useraddr); ++ break; ++ default: ++ rc = -EOPNOTSUPP; ++ } ++ ++ if (dev->ethtool_ops->complete) ++ dev->ethtool_ops->complete(dev); ++ ++ if (old_features != dev->features) ++ netdev_features_change(dev); ++ ++ return rc; ++} ++ ++int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd) ++{ ++ struct ifreq ifr = {0}; ++ int ret; ++ ++ strncpy(ifr.ifr_name, ctx->netdev, IFNAMSIZ); ++ ifr.ifr_data = cmd; ++ ++ 
rtnl_lock(); ++ ret = dev_ethtool_kern(&init_net, &ifr); ++ rtnl_unlock(); ++ ++ return ret; ++} ++ ++struct oecls_netdev_info *get_oecls_netdev_info(unsigned int index) ++{ ++ if (index >= OECLS_MAX_NETDEV_NUM) ++ return NULL; ++ return &oecls_netdev_info_table[index]; ++} ++ ++static struct oecls_netdev_info *alloc_oecls_netdev_info(void) ++{ ++ if (oecls_netdev_num >= OECLS_MAX_NETDEV_NUM) ++ return NULL; ++ ++ return &oecls_netdev_info_table[oecls_netdev_num++]; ++} ++ ++static bool check_irq_name(const char *irq_name, struct oecls_netdev_info *oecls_dev) ++{ ++ if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx")) ++ return false; ++ ++ if (strstr(irq_name, oecls_dev->dev_name)) ++ return true; ++ ++ if (oecls_dev->netdev->dev.parent && ++ strstr(irq_name, dev_name(oecls_dev->netdev->dev.parent))) ++ return true; ++ ++ return false; ++} ++ ++static void get_netdev_queue_info(struct oecls_netdev_info *oecls_dev) ++{ ++ struct oecls_netdev_queue_info *rxq_info; ++ struct irq_desc *desc; ++ int irq, cpu; ++ ++ for_each_irq_desc(irq, desc) { ++ if (!desc->action) ++ continue; ++ if (!desc->action->name) ++ continue; ++ if (!check_irq_name(desc->action->name, oecls_dev)) ++ continue; ++ if (oecls_dev->rxq_num >= OECLS_MAX_RXQ_NUM_PER_DEV) ++ break; ++ rxq_info = &oecls_dev->rxq[oecls_dev->rxq_num++]; ++ rxq_info->irq = irq; ++ cpu = cpumask_first(irq_data_get_effective_affinity_mask(&desc->irq_data)); ++ rxq_info->affinity_cpu = cpu; ++ oecls_debug("irq=%d, [%s], rxq_id=%d affinity_cpu:%d\n", ++ irq, desc->action->name, oecls_dev->rxq_num, cpu); ++ } ++} ++ ++static int oecls_filter_enable(const char *dev_name, bool *old_state) ++{ ++ struct ethtool_value eval = {0}; ++ struct cmd_context ctx = {0}; ++ int ret; ++ ++ strncpy(ctx.netdev, dev_name, IFNAMSIZ); ++ ++ eval.cmd = ETHTOOL_GFLAGS; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ oecls_error("get %s flags fail, ret:%d\n", dev_name, ret); ++ return ret; ++ } ++ if (eval.data & ETH_FLAG_NTUPLE) { ++ *old_state = true; ++ oecls_debug("%s ntuple is already on\n", dev_name); ++ return 0; ++ } ++ ++ // Set ntuple feature ++ eval.cmd = ETHTOOL_SFLAGS; ++ eval.data |= ETH_FLAG_NTUPLE; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ oecls_error("set %s flags fail, ret:%d\n", dev_name, ret); ++ return ret; ++ } ++ ++ // Get ntuple feature ++ eval.cmd = ETHTOOL_GFLAGS; ++ eval.data = 0; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ oecls_error("get %s flags fail, ret:%d\n", dev_name, ret); ++ return ret; ++ } ++ if (!(eval.data & ETH_FLAG_NTUPLE)) { ++ oecls_error("enable ntuple feature fail!\n"); ++ return -EOPNOTSUPP; ++ } ++ ++ return 0; ++} ++ ++static void oecls_filter_restore(const char *dev_name, bool old_state) ++{ ++ struct ethtool_value eval = {0}; ++ struct cmd_context ctx = {0}; ++ bool cur_filter_state; ++ int ret; ++ ++ strncpy(ctx.netdev, dev_name, IFNAMSIZ); ++ ++ eval.cmd = ETHTOOL_GFLAGS; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ oecls_error("get %s flags fail, ret:%d\n", dev_name, ret); ++ return; ++ } ++ ++ cur_filter_state = (eval.data & ETH_FLAG_NTUPLE) ? 
true : false; ++ if (cur_filter_state == old_state) ++ return; ++ ++ // Set ntuple feature ++ eval.cmd = ETHTOOL_SFLAGS; ++ if (old_state) ++ eval.data |= ETH_FLAG_NTUPLE; ++ else ++ eval.data &= ~ETH_FLAG_NTUPLE; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ oecls_error("set %s flags fail, ret:%d\n", dev_name, ret); ++ return; ++ } ++} ++ ++static int init_single_oecls_dev(char *if_name, unsigned int length) ++{ ++ struct oecls_netdev_info *oecls_dev; ++ char dev_name[IFNAMSIZ] = { 0 }; ++ struct net_device *netdev; ++ int cpy_len = length < IFNAMSIZ ? length : IFNAMSIZ; ++ bool old_state = false; ++ int ret; ++ ++ strncpy(dev_name, if_name, cpy_len); ++ netdev = dev_get_by_name(&init_net, dev_name); ++ if (!netdev) { ++ oecls_error("dev [%s] is not exist!\n", dev_name); ++ return -ENODEV; ++ } ++ ++ if (!(netdev->flags & IFF_UP)) { ++ ret = -ENETDOWN; ++ oecls_error("dev:%s not up! flags=%d.\n", dev_name, netdev->flags); ++ goto out; ++ } ++ ++ if (netdev->flags & IFF_LOOPBACK) { ++ ret = -EOPNOTSUPP; ++ oecls_error("Do not support loopback.\n"); ++ goto out; ++ } ++ ++ ret = oecls_filter_enable(dev_name, &old_state); ++ if (ret) { ++ oecls_error("dev [%s] not support ntuple! ret=%d\n", dev_name, ret); ++ goto out; ++ } ++ ++ oecls_dev = alloc_oecls_netdev_info(); ++ if (!oecls_dev) { ++ ret = -ENOMEM; ++ oecls_filter_restore(dev_name, old_state); ++ oecls_error("alloc oecls_dev fail! oecls_netdev_num:%d\n", oecls_netdev_num); ++ goto out; ++ } ++ ++ memcpy_r(oecls_dev->dev_name, dev_name, IFNAMSIZ); ++ oecls_dev->old_filter_state = old_state; ++ oecls_dev->netdev = netdev; ++ get_netdev_queue_info(oecls_dev); ++ return 0; ++ ++out: ++ dev_put(netdev); ++ return ret; ++} ++ ++static void clean_oecls_netdev_info(void) ++{ ++ struct oecls_netdev_info *oecls_dev; ++ struct net_device *netdev; ++ int devid; ++ ++ for_each_oecls_netdev(devid, oecls_dev) { ++ oecls_filter_restore(oecls_dev->dev_name, oecls_dev->old_filter_state); ++ netdev = oecls_dev->netdev; ++ if (netdev) { ++ oecls_dev->netdev = NULL; ++ dev_put(netdev); ++ } ++ } ++ ++ oecls_netdev_num = 0; ++} ++ ++static int init_oecls_netdev_info(char *netdev_str) ++{ ++ char *start = netdev_str, *end; ++ int err = -ENODEV; ++ ++ while (*start != '\0') { ++ // skip start # ++ end = strchr(start, '#'); ++ if (end == start) { ++ start++; ++ continue; ++ } ++ ++ // find the last ifname ++ if (!end) { ++ err = init_single_oecls_dev(start, strlen(start)); ++ break; ++ } ++ ++ err = init_single_oecls_dev(start, end - start); ++ if (err) ++ break; ++ start = end + 1; ++ } ++ ++ return err; ++} ++ ++struct oecls_numa_info *get_oecls_numa_info(unsigned int nid) ++{ ++ if (nid >= oecls_numa_num) ++ return NULL; ++ return &oecls_numa_info_table[nid]; ++} ++ ++static void clean_oecls_numa_info(void) ++{ ++ oecls_numa_num = 0; ++ kfree(oecls_numa_info_table); ++} ++ ++static void init_numa_avail_cpus(int nid, struct oecls_numa_info *numa_info) ++{ ++ int cpu; ++ ++ oecls_debug("numa node %d: %*pb, %*pbl\n", nid, cpumask_pr_args(cpumask_of_node(nid)), ++ cpumask_pr_args(cpumask_of_node(nid))); ++ ++ bitmap_zero(numa_info->avail_cpus, OECLS_MAX_CPU_NUM); ++ for_each_cpu(cpu, cpumask_of_node(nid)) { ++ if (cpu >= OECLS_MAX_CPU_NUM) ++ return; ++ set_bit(cpu, numa_info->avail_cpus); ++ } ++} ++ ++static void clean_oecls_rxq(void) ++{ ++ struct oecls_numa_bound_dev_info *bound_dev; ++ struct oecls_netdev_info *oecls_dev; ++ struct oecls_numa_info *numa_info; ++ int nid, devid; ++ ++ for_each_oecls_numa(nid, numa_info) { ++ 
for_each_oecls_netdev(devid, oecls_dev) { ++ bound_dev = &numa_info->bound_dev[devid]; ++ kfree(bound_dev->cluster_info); ++ } ++ } ++} ++ ++static int init_numa_rxq_bitmap(int nid, struct oecls_numa_info *numa_info) ++{ ++ int bound_rxq_num, cluster_id, cluster_idx, cur_idx; ++ struct oecls_numa_bound_dev_info *bound_dev; ++ struct oecls_netdev_info *oecls_dev; ++ int rxq_id, devid, cpu, ret = 0; ++ ++ for_each_oecls_netdev(devid, oecls_dev) { ++ bound_rxq_num = 0; ++ bound_dev = &numa_info->bound_dev[devid]; ++ bitmap_zero(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); ++ bound_dev->cluster_info = kzalloc(sizeof(struct oecls_numa_clusterinfo) ++ * oecls_cluster_per_numa, GFP_ATOMIC); ++ if (!bound_dev->cluster_info) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) { ++ cpu = oecls_dev->rxq[rxq_id].affinity_cpu; ++ if (cpu_to_node(cpu) == nid) { ++ set_bit(rxq_id, bound_dev->bitmap_rxq); ++ cluster_id = cpu / oecls_cluster_cpu_num; ++ cluster_idx = cluster_id % oecls_cluster_per_numa; ++ bound_dev->cluster_info[cluster_idx].cluster_id = cluster_id; ++ cur_idx = bound_dev->cluster_info[cluster_idx].cur_freeidx++; ++ bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].rxq_id = rxq_id; ++ bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].status = 1; ++ bound_rxq_num++; ++ oecls_debug("cpu:%d cluster_id:%d cluster_idx:%d rxq_id:%d cur_idx:%d\n", ++ cpu, cluster_id, cluster_idx, rxq_id, cur_idx); ++ } ++ } ++ ++ oecls_debug("nid:%d, dev_id:%d, dev:%s, rxq_num:%d, bit_num:%d, bitmap_rxq:%*pbl\n", ++ nid, devid, oecls_dev->dev_name, oecls_dev->rxq_num, ++ bound_rxq_num, OECLS_MAX_RXQ_NUM_PER_DEV, bound_dev->bitmap_rxq); ++ } ++ return ret; ++ ++out: ++ clean_oecls_rxq(); ++ return ret; ++} ++ ++static int get_cluster_rxq(struct oecls_numa_bound_dev_info *bound_dev) ++{ ++ int cpu = smp_processor_id(); ++ int cluster_id = cpu / oecls_cluster_cpu_num; ++ int i, j, rxq_id; ++ ++ for (i = 0; i < oecls_cluster_per_numa; i++) { ++ if (cluster_id != bound_dev->cluster_info[i].cluster_id) ++ continue; ++ for (j = 0; j < OECLS_MAX_RXQ_NUM_PER_DEV; j++) { ++ if (bound_dev->cluster_info[i].rxqs[j].status == 1) { ++ bound_dev->cluster_info[i].rxqs[j].status = 2; ++ rxq_id = bound_dev->cluster_info[i].rxqs[j].rxq_id; ++ oecls_debug("cluster:%d cpu:%d alloc rxq_id:%d\n", ++ cluster_id, cpu, rxq_id); ++ return rxq_id; ++ } ++ } ++ } ++ oecls_debug("cluster:%d no free rxq for cpu:%d\n", cluster_id, cpu); ++ return -1; ++} ++ ++static int put_cluster_rxq(struct oecls_numa_bound_dev_info *bound_dev, int rxq_id) ++{ ++ int i, j; ++ ++ for (i = 0; i < oecls_cluster_per_numa; i++) { ++ for (j = 0; j < OECLS_MAX_RXQ_NUM_PER_DEV; j++) { ++ if (bound_dev->cluster_info[i].rxqs[j].status == 2 && ++ bound_dev->cluster_info[i].rxqs[j].rxq_id == rxq_id) { ++ bound_dev->cluster_info[i].rxqs[j].status = 1; ++ oecls_debug("free rxq_id:%d\n", rxq_id); ++ return 0; ++ } ++ } ++ } ++ oecls_debug("no match malloced rxq_id:%d\n", rxq_id); ++ return -1; ++} ++ ++int alloc_rxq_id(int nid, int devid) ++{ ++ struct oecls_numa_bound_dev_info *bound_dev; ++ struct oecls_numa_info *numa_info; ++ int rxq_id; ++ ++ numa_info = get_oecls_numa_info(nid); ++ if (!numa_info) { ++ oecls_error("error nid:%d\n", nid); ++ return -EINVAL; ++ } ++ ++ if (devid >= OECLS_MAX_NETDEV_NUM) { ++ oecls_error("error bound_dev index:%d\n", devid); ++ return -EINVAL; ++ } ++ bound_dev = &numa_info->bound_dev[devid]; ++ ++ if (strategy == 1) { ++ rxq_id = get_cluster_rxq(bound_dev); ++ if (rxq_id < 0 || 
rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) ++ pr_info("failed to get rxq_id:%d in cluster, try numa\n", rxq_id); ++ else ++ goto found; ++ } ++ ++ rxq_id = find_first_bit(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); ++ if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) { ++ oecls_error("error rxq_id:%d\n", rxq_id); ++ return -EINVAL; ++ } ++ ++found: ++ clear_bit(rxq_id, bound_dev->bitmap_rxq); ++ oecls_debug("alloc nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); ++ return rxq_id; ++} ++ ++void free_rxq_id(int nid, int devid, int rxq_id) ++{ ++ struct oecls_numa_bound_dev_info *bound_dev; ++ struct oecls_numa_info *numa_info; ++ ++ numa_info = get_oecls_numa_info(nid); ++ if (!numa_info) { ++ oecls_error("error nid:%d\n", nid); ++ return; ++ } ++ ++ if (devid >= OECLS_MAX_NETDEV_NUM) { ++ oecls_error("error bound_dev index:%d\n", devid); ++ return; ++ } ++ bound_dev = &numa_info->bound_dev[devid]; ++ ++ if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) { ++ oecls_error("error rxq_id:%d\n", rxq_id); ++ return; ++ } ++ ++ if (strategy == 1) ++ put_cluster_rxq(bound_dev, rxq_id); ++ ++ if (test_bit(rxq_id, bound_dev->bitmap_rxq)) { ++ oecls_error("error nid:%d, devid:%d, rxq_id:%d\n", nid, devid, rxq_id); ++ return; ++ } ++ ++ set_bit(rxq_id, bound_dev->bitmap_rxq); ++ oecls_debug("free nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); ++} ++ ++static int init_oecls_numa_info(void) ++{ ++ struct oecls_numa_info *numa_info; ++ int nid, ret = 0; ++ ++ oecls_numa_num = num_online_nodes(); ++ oecls_numa_info_table = kzalloc(sizeof(struct oecls_numa_info) * oecls_numa_num, GFP_ATOMIC); ++ if (!oecls_numa_info_table) { ++ ret = -ENOMEM; ++ oecls_error("oecls_numa_info_table alloc failed:%d\n", ret); ++ return ret; ++ } ++ ++ oecls_cluster_cpu_num = cpumask_weight(topology_cluster_cpumask(smp_processor_id())); ++ oecls_cluster_per_numa = (nr_cpu_ids / oecls_cluster_cpu_num) / oecls_numa_num; ++ oecls_debug("oecls_numa_num=%d cluster_cpu_num:%d cluster_cpu_num:%d\n", ++ oecls_numa_num, oecls_cluster_per_numa, oecls_cluster_cpu_num); ++ ++ for_each_oecls_numa(nid, numa_info) ++ init_numa_avail_cpus(nid, numa_info); ++ ++ return ret; ++} ++ ++static int alloc_available_cpu(int nid, struct oecls_numa_info *numa_info) ++{ ++ int cpu; ++ ++ cpu = find_first_bit(numa_info->avail_cpus, OECLS_MAX_CPU_NUM); ++ if (cpu >= OECLS_MAX_CPU_NUM) { ++ oecls_error("no available cpus: nid=%d, cpu=%d\n", nid, cpu); ++ return -1; ++ } ++ ++ clear_bit(cpu, numa_info->avail_cpus); ++ return cpu; ++} ++ ++static void add_netdev_irq_affinity_cpu(struct oecls_netdev_info *oecls_dev, int rxq_id, int cpu) ++{ ++ struct oecls_netdev_queue_info *rxq_info; ++ ++ if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) ++ return; ++ ++ rxq_info = &oecls_dev->rxq[rxq_id]; ++ rxq_info->affinity_cpu = cpu; ++} ++ ++static void config_affinity_strategy_default(struct oecls_netdev_info *oecls_dev) ++{ ++ struct oecls_numa_info *numa_info; ++ int rxq_num = oecls_dev->rxq_num; ++ int rxq_per_numa = rxq_num / oecls_numa_num; ++ int remain = rxq_num - rxq_per_numa * oecls_numa_num; ++ int numa_rxq_id, rxq_id, nid, cpu; ++ ++ oecls_debug("dev=%s, rxq_num=%d, rxq_per_numa=%d, remain=%d\n", oecls_dev->dev_name, ++ rxq_num, rxq_per_numa, remain); ++ ++ // average config rxq to every numa ++ for_each_oecls_numa(nid, numa_info) { ++ for (numa_rxq_id = 0; numa_rxq_id < rxq_per_numa; numa_rxq_id++) { ++ cpu = alloc_available_cpu(nid, numa_info); ++ if (cpu < 0) ++ break; ++ ++ rxq_id = rxq_per_numa * nid + numa_rxq_id; ++ add_netdev_irq_affinity_cpu(oecls_dev, rxq_id, 
cpu);
++		}
++	}
++
++	if (!remain)
++		return;
++
++	// Assign the remaining rxqs, one per numa node
++	numa_rxq_id = 0;
++	for_each_oecls_numa(nid, numa_info) {
++		if (numa_rxq_id >= remain)
++			break;
++		cpu = alloc_available_cpu(nid, numa_info);
++		if (cpu < 0)
++			break;
++
++		rxq_id = rxq_per_numa * oecls_numa_num + numa_rxq_id;
++		numa_rxq_id++;
++		add_netdev_irq_affinity_cpu(oecls_dev, rxq_id, cpu);
++	}
++}
++
++static void config_affinity_strategy_cluster(struct oecls_netdev_info *oecls_dev)
++{
++	int rxq_num = oecls_dev->rxq_num;
++	int rxq_per_numa = rxq_num / oecls_numa_num;
++	int remain = rxq_num - rxq_per_numa * oecls_numa_num;
++	int cpu_idx = oecls_cluster_cpu_num - 1;
++	int cluster, cpu, rxq_id = 0, round;
++
++	round = rxq_per_numa < oecls_cluster_per_numa ? rxq_per_numa : oecls_cluster_per_numa;
++	if (remain > 0)
++		round++;
++	oecls_debug("round=%d\n", round);
++
++	while (rxq_id < oecls_dev->rxq_num) {
++		for (cluster = 0; cluster < oecls_cluster_per_numa * oecls_numa_num; cluster++) {
++			if (cluster % oecls_cluster_per_numa >= round)
++				continue;
++			cpu = cluster * oecls_cluster_cpu_num + cpu_idx;
++			if (rxq_id >= oecls_dev->rxq_num)
++				break;
++			add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu);
++		}
++		cpu_idx--;
++		if (cpu_idx < 0)
++			cpu_idx = oecls_cluster_cpu_num - 1;
++	}
++}
++
++static void config_affinity_strategy_numa(struct oecls_netdev_info *oecls_dev)
++{
++	int rxq_num = oecls_dev->rxq_num;
++	int rxq_per_numa = rxq_num / oecls_numa_num;
++	int cpu_per_numa = nr_cpu_ids / oecls_numa_num;
++	int remain = rxq_num - rxq_per_numa * oecls_numa_num;
++	struct oecls_numa_info *numa_info;
++	int numa_start_cpu, numa_cpu_id;
++	int rxq_id = 0, nid, cpu;
++
++	for_each_oecls_numa(nid, numa_info) {
++		numa_start_cpu = find_first_bit(numa_info->avail_cpus, OECLS_MAX_CPU_NUM);
++		for (numa_cpu_id = 0; numa_cpu_id < rxq_per_numa; numa_cpu_id++) {
++			cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa);
++			if (rxq_id >= oecls_dev->rxq_num)
++				break;
++			add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu);
++		}
++		if (remain-- > 0) {
++			cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa);
++			add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu);
++		}
++	}
++}
++
++static void config_affinity_strategy_custom(struct oecls_netdev_info *oecls_dev)
++{
++	oecls_debug("dev=%s\n", oecls_dev->dev_name);
++}
++
++static void config_affinity_strategy(void)
++{
++	struct oecls_netdev_info *oecls_dev;
++	int devid;
++
++	for_each_oecls_netdev(devid, oecls_dev) {
++		switch (strategy) {
++		case 1:
++			config_affinity_strategy_cluster(oecls_dev);
++			break;
++		case 2:
++			config_affinity_strategy_numa(oecls_dev);
++			break;
++		case 3:
++			config_affinity_strategy_custom(oecls_dev);
++			break;
++		case 0:
++		default:
++			config_affinity_strategy_default(oecls_dev);
++			break;
++		}
++	}
++}
++
++static inline void irq_set_affinity_wrapper(int rxq, int irq, int cpu)
++{
++	int err = 0;
++
++	err = irq_set_affinity(irq, get_cpu_mask(cpu));
++	oecls_debug("rxq=%d, irq=%d, cpu=%d, err=%d\n", rxq, irq, cpu, err);
++}
++
++static void enable_affinity_strategy(void)
++{
++	struct oecls_netdev_queue_info *rxq_info;
++	struct oecls_netdev_info *oecls_dev;
++	int rxq_id, devid;
++
++	for_each_oecls_netdev(devid, oecls_dev) {
++		for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) {
++			rxq_info = &oecls_dev->rxq[rxq_id];
++			irq_set_affinity_wrapper(rxq_id, rxq_info->irq, rxq_info->affinity_cpu);
++		}
++	}
++}
++
++static inline void netif_set_xps_queue_wrapper(struct net_device *netdev, int rxq_id,
++					       const struct cpumask *cpu_mask)
++{
++	int err = 0;
++
++	err = netif_set_xps_queue(netdev, cpu_mask, rxq_id);
++	oecls_debug("name=%s, rxq_id=%d, mask=%*pbl, err=%d\n", netdev->name, rxq_id,
++		    cpumask_pr_args(cpu_mask), err);
++}
++
++static void set_netdev_xps_queue(bool enable)
++{
++	const struct cpumask clear_mask = { 0 };
++	struct oecls_netdev_info *oecls_dev;
++	const struct cpumask *cpu_mask;
++	int rxq_id, devid, cpu, nid;
++
++	for_each_oecls_netdev(devid, oecls_dev) {
++		for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) {
++			cpu = oecls_dev->rxq[rxq_id].affinity_cpu;
++			nid = cpu_to_node(cpu);
++			if (enable)
++				cpu_mask = cpumask_of_node(nid);
++			else
++				cpu_mask = &clear_mask;
++
++			netif_set_xps_queue_wrapper(oecls_dev->netdev, rxq_id, cpu_mask);
++		}
++	}
++}
++
++static __init int oecls_init(void)
++{
++	struct oecls_numa_info *numa_info;
++	int nid, err;
++
++	if (!check_params())
++		return -EINVAL;
++
++	err = init_oecls_numa_info();
++	if (err)
++		return err;
++
++	err = init_oecls_netdev_info(ifname);
++	if (err)
++		goto clean_numa;
++
++	// Set irq affinity
++	config_affinity_strategy();
++	enable_affinity_strategy();
++
++	// Record which rxqs are bound to each numa node
++	for_each_oecls_numa(nid, numa_info) {
++		err = init_numa_rxq_bitmap(nid, numa_info);
++		if (err)
++			goto clean_rxq;
++	}
++
++#ifdef CONFIG_XPS
++	set_netdev_xps_queue(true);
++#endif
++
++	if (mode == 0)
++		oecls_ntuple_res_init();
++	else
++		oecls_flow_res_init();
++
++	return 0;
++
++clean_rxq:
++clean_numa:
++	clean_oecls_netdev_info();
++	clean_oecls_numa_info();
++	return err;
++}
++
++static __exit void oecls_exit(void)
++{
++	if (mode == 0)
++		oecls_ntuple_res_clean();
++	else
++		oecls_flow_res_clean();
++
++#ifdef CONFIG_XPS
++	set_netdev_xps_queue(false);
++#endif
++
++	clean_oecls_rxq();
++	clean_oecls_netdev_info();
++	clean_oecls_numa_info();
++}
++
++module_init(oecls_init);
++module_exit(oecls_exit);
++
++MODULE_DESCRIPTION("oenetcls");
++MODULE_LICENSE("GPL v2");
+diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c
+new file mode 100644
+index 000000000..2845a5756
+--- /dev/null
++++ b/net/oenetcls/oenetcls_ntuple.c
+@@ -0,0 +1,573 @@
++// SPDX-License-Identifier: GPL-2.0-only
++#include <linux/inetdevice.h>
++#include <linux/ethtool.h>
++#include <linux/netdevice.h>
++#include <linux/rtnetlink.h>
++#include <linux/irq.h>
++#include <linux/irqdesc.h>
++#include <linux/inet.h>
++#include <linux/jhash.h>
++#include <linux/oenetcls.h>
++#include <net/sock.h>
++
++#include "oenetcls.h"
++
++struct oecls_sk_rule_list oecls_sk_rules, oecls_sk_list;
++
++static void init_oecls_sk_rules(void)
++{
++	unsigned int i;
++
++	for (i = 0; i < OECLS_SK_RULE_HASHSIZE; i++)
++		INIT_HLIST_HEAD(oecls_sk_rules.hash + i);
++	mutex_init(&oecls_sk_rules.mutex);
++}
++
++static inline struct hlist_head *get_rule_hashlist(u32 dip4, u16 dport)
++{
++	return oecls_sk_rules.hash + (jhash_2words(dip4, dport, 0) & OECLS_SK_RULE_HASHMASK);
++}
++
++static inline struct hlist_head *get_sk_hashlist(void *sk)
++{
++	return oecls_sk_list.hash + (jhash(sk, sizeof(sk), 0) & OECLS_SK_RULE_HASHMASK);
++}
++
++static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action,
++			int ruleid, int nid)
++{
++	struct hlist_head *hlist = get_rule_hashlist(dip4, dport);
++	struct hlist_head *sk_hlist = get_sk_hashlist(sk);
++	struct oecls_sk_rule *rule;
++	struct oecls_sk_entry *entry;
++
++	rule = kzalloc(sizeof(struct oecls_sk_rule), GFP_ATOMIC);
++	entry = kzalloc(sizeof(struct oecls_sk_entry), GFP_ATOMIC);
++	if (!rule || !entry)
++		goto out;
++
++	rule->sk = sk;
++	rule->dip4 = dip4;
++	rule->dport = dport;
++	rule->devid = devid;
++	rule->action = action;
++	rule->ruleid = ruleid;
++	rule->nid = nid;
++	hlist_add_head(&rule->node, hlist);
++
++	entry->sk = sk;
++	entry->sk_rule_hash = jhash_2words(dip4, dport, 0);
++	hlist_add_head(&entry->node, sk_hlist);
++	return;
++out:
++	oecls_debug("alloc failed rule:%p entry:%p\n", rule, entry);
++	kfree(entry);
++	kfree(rule);
++}
++
++static struct oecls_sk_entry *get_sk_entry(void *sk)
++{
++	struct hlist_head *sk_hlist = get_sk_hashlist(sk);
++	struct oecls_sk_entry *entry = NULL;
++
++	hlist_for_each_entry(entry, sk_hlist, node) {
++		if (entry->sk == sk)
++			break;
++	}
++	return entry;
++}
++
++static void del_sk_rule(struct oecls_sk_rule *rule)
++{
++	struct oecls_sk_entry *entry;
++
++	entry = get_sk_entry(rule->sk);
++	if (!entry)
++		return;
++	hlist_del_init(&entry->node);
++	kfree(entry);
++
++	oecls_debug("del rule=%p\n", rule);
++	hlist_del_init(&rule->node);
++	kfree(rule);
++}
++
++static struct oecls_sk_rule *get_sk_rule(int devid, u32 dip4, u16 dport)
++{
++	struct hlist_head *hlist = get_rule_hashlist(dip4, dport);
++	struct oecls_sk_rule *rule = NULL;
++
++	hlist_for_each_entry(rule, hlist, node) {
++		if (rule->devid == devid && rule->dip4 == dip4 && rule->dport == dport)
++			break;
++	}
++	return rule;
++}
++
++static struct oecls_sk_rule *get_rule_from_sk(int devid, void *sk)
++{
++	struct oecls_sk_rule *rule = NULL;
++	struct oecls_sk_entry *entry;
++	struct hlist_head *hlist;
++
++	entry = get_sk_entry(sk);
++	if (!entry)
++		return NULL;
++
++	hlist = oecls_sk_rules.hash + (entry->sk_rule_hash & OECLS_SK_RULE_HASHMASK);
++	hlist_for_each_entry(rule, hlist, node) {
++		if (rule->devid == devid && rule->sk == sk)
++			break;
++	}
++	return rule;
++}
++
++static inline bool reuseport_check(int devid, u32 dip4, u16 dport)
++{
++	return !!get_sk_rule(devid, dip4, dport);
++}
++
++static u32 get_first_ip4_addr(struct net *net)
++{
++	struct in_device *in_dev;
++	struct net_device *dev;
++	struct in_ifaddr *ifa;
++	u32 dip4 = 0;
++
++	rtnl_lock();
++	rcu_read_lock();
++	for_each_netdev(net, dev) {
++		if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP))
++			continue;
++		in_dev = __in_dev_get_rcu(dev);
++		if (!in_dev)
++			continue;
++
++		in_dev_for_each_ifa_rcu(ifa, in_dev) {
++			if (!strcmp(dev->name, ifa->ifa_label)) {
++				dip4 = ifa->ifa_local;
++				oecls_debug("dev: %s, dip4:%pI4\n", dev->name, &dip4);
++				goto out;
++			}
++		}
++	}
++out:
++	rcu_read_unlock();
++	rtnl_unlock();
++	return dip4;
++}
++
++static void get_sk_rule_addr(struct sock *sk, u32 *dip4, u16 *dport)
++{
++	*dport = htons(sk->sk_num);
++
++	if (!match_ip_flag) {
++		*dip4 = 0;
++		return;
++	}
++
++	if (sk->sk_rcv_saddr)
++		*dip4 = sk->sk_rcv_saddr;
++	else
++		*dip4 = get_first_ip4_addr(sock_net(sk));
++}
++
++static int rxclass_rule_del(struct cmd_context *ctx, __u32 loc)
++{
++	struct ethtool_rxnfc nfccmd;
++	int err;
++
++	nfccmd.cmd = ETHTOOL_SRXCLSRLDEL;
++	nfccmd.fs.location = loc;
++	err = send_ethtool_ioctl(ctx, &nfccmd);
++	if (err < 0)
++		oecls_debug("rmgr: Cannot delete RX class rule, loc:%u\n", loc);
++	return err;
++}
++
++static int rmgr_ins(struct rmgr_ctrl *rmgr, __u32 loc)
++{
++	if (loc >= rmgr->size) {
++		oecls_error("rmgr: Location out of range\n");
++		return -1;
++	}
++
++	set_bit(loc, rmgr->slot);
++	return 0;
++}
++
++static int rmgr_find_empty_slot(struct rmgr_ctrl *rmgr, struct ethtool_rx_flow_spec *fsp)
++{
++	__u32 loc, slot_num;
++
++	if (rmgr->driver_select)
++		return 0;
++
++	loc = rmgr->size - 1;
++	slot_num = loc / BITS_PER_LONG;
++	if (!~(rmgr->slot[slot_num] | (~1UL << rmgr->size % BITS_PER_LONG))) {
++		loc -= 1 + (loc % BITS_PER_LONG);
++		slot_num--;
++	}
++
++	while (loc < rmgr->size && !~(rmgr->slot[slot_num])) {
++		loc -= BITS_PER_LONG;
++		slot_num--;
++	}
++
++	while (loc < rmgr->size && test_bit(loc, rmgr->slot))
++		loc--;
++
++	if (loc < rmgr->size) {
++		fsp->location = loc;
++		return rmgr_ins(rmgr, loc);
++	}
++
++	return -1;
++}
++
++static int rxclass_get_dev_info(struct cmd_context *ctx, __u32 *count, int *driver_select)
++{
++	struct ethtool_rxnfc nfccmd;
++	int err;
++
++	nfccmd.cmd = ETHTOOL_GRXCLSRLCNT;
++	nfccmd.data = 0;
++	err = send_ethtool_ioctl(ctx, &nfccmd);
++	*count = nfccmd.rule_cnt;
++	if (driver_select)
++		*driver_select = !!(nfccmd.data & RX_CLS_LOC_SPECIAL);
++	if (err < 0)
++		oecls_debug("rxclass: Cannot get RX class rule count\n");
++
++	return err;
++}
++
++static int rmgr_init(struct cmd_context *ctx, struct rmgr_ctrl *rmgr)
++{
++	struct ethtool_rxnfc *nfccmd;
++	__u32 *rule_locs;
++	int i, err = 0;
++
++	memset(rmgr, 0, sizeof(*rmgr));
++	err = rxclass_get_dev_info(ctx, &rmgr->n_rules, &rmgr->driver_select);
++	if (err < 0)
++		return err;
++
++	if (rmgr->driver_select)
++		return err;
++
++	nfccmd = kzalloc(sizeof(*nfccmd) + (rmgr->n_rules * sizeof(__u32)), GFP_ATOMIC);
++	if (!nfccmd) {
++		oecls_error("rmgr: Cannot allocate memory for RX class rule locations\n");
++		err = -ENOMEM;
++		goto out;
++	}
++
++	nfccmd->cmd = ETHTOOL_GRXCLSRLALL;
++	nfccmd->rule_cnt = rmgr->n_rules;
++	err = send_ethtool_ioctl(ctx, nfccmd);
++	if (err < 0) {
++		oecls_debug("rmgr: Cannot get RX class rules\n");
++		goto out;
++	}
++
++	rmgr->size = nfccmd->data;
++	if (rmgr->size == 0 || rmgr->size < rmgr->n_rules) {
++		oecls_error("rmgr: Invalid RX class rules table size\n");
++		err = -EINVAL;
++		goto out;
++	}
++
++	rmgr->slot = kzalloc(BITS_TO_LONGS(rmgr->size) * sizeof(long), GFP_ATOMIC);
++	if (!rmgr->slot) {
++		oecls_error("rmgr: Cannot allocate memory for RX class rules\n");
++		err = -ENOMEM;
++		goto out;
++	}
++
++	rule_locs = nfccmd->rule_locs;
++	for (i = 0; i < rmgr->n_rules; i++) {
++		err = rmgr_ins(rmgr, rule_locs[i]);
++		if (err < 0)
++			break;
++	}
++
++out:
++	kfree(nfccmd);
++	return err;
++}
++
++static void rmgr_cleanup(struct rmgr_ctrl *rmgr)
++{
++	kfree(rmgr->slot);
++	rmgr->slot = NULL;
++	rmgr->size = 0;
++}
++
++static int rmgr_set_location(struct cmd_context *ctx,
++			     struct ethtool_rx_flow_spec *fsp)
++{
++	struct rmgr_ctrl rmgr;
++	int ret;
++
++	ret = rmgr_init(ctx, &rmgr);
++	if (ret < 0)
++		goto out;
++
++	ret = rmgr_find_empty_slot(&rmgr, fsp);
++out:
++	rmgr_cleanup(&rmgr);
++	return ret;
++}
++
++static int rxclass_rule_ins(struct cmd_context *ctx,
++			    struct ethtool_rx_flow_spec *fsp, u32 rss_context)
++{
++	struct ethtool_rxnfc nfccmd;
++	u32 loc = fsp->location;
++	int ret;
++
++	if (loc & RX_CLS_LOC_SPECIAL) {
++		ret = rmgr_set_location(ctx, fsp);
++		if (ret < 0)
++			return ret;
++	}
++
++	nfccmd.cmd = ETHTOOL_SRXCLSRLINS;
++	nfccmd.rss_context = rss_context;
++	nfccmd.fs = *fsp;
++	ret = send_ethtool_ioctl(ctx, &nfccmd);
++	if (ret < 0) {
++		oecls_debug("Cannot insert the classification rule\n");
++		return ret;
++	}
++
++	if (loc & RX_CLS_LOC_SPECIAL)
++		oecls_debug("Added rule with ID %d\n", nfccmd.fs.location);
++
++	return 0;
++}
++
++static void flow_spec_to_ntuple(struct ethtool_rx_flow_spec *fsp,
++				struct ethtool_rx_ntuple_flow_spec *ntuple)
++{
++	int i;
++
++	memset(ntuple, ~0, sizeof(*ntuple));
++	ntuple->flow_type = fsp->flow_type;
++	ntuple->action = fsp->ring_cookie;
++	memcpy_r(&ntuple->h_u, &fsp->h_u, sizeof(fsp->h_u));
++	memcpy_r(&ntuple->m_u, &fsp->m_u, sizeof(fsp->m_u));
++	for (i = 0; i < sizeof(ntuple->m_u); i++)
++		ntuple->m_u.hdata[i] ^= 0xFF;
++	ntuple->flow_type &= ~FLOW_EXT;
++}
++
++static int do_srxntuple(struct cmd_context *ctx, struct ethtool_rx_flow_spec *fsp)
++{
++	struct ethtool_rx_ntuple ntuplecmd;
++	struct ethtool_value eval;
++	int ret = 0;
++
++	flow_spec_to_ntuple(fsp, &ntuplecmd.fs);
++
++	eval.cmd = ETHTOOL_GFLAGS;
++	ret = send_ethtool_ioctl(ctx, &eval);
++	if (ret || !(eval.data & ETH_FLAG_NTUPLE))
++		return -1;
++
++	ntuplecmd.cmd = ETHTOOL_SRXNTUPLE;
++	ret = send_ethtool_ioctl(ctx, &ntuplecmd);
++	if (ret)
++		oecls_debug("Cannot add new rule via N-tuple, ret:%d\n", ret);
++
++	return ret;
++}
++
++static int cfg_ethtool_rule(struct cmd_context *ctx, bool is_del)
++{
++	struct ethtool_rx_flow_spec *fsp, rx_rule_fs;
++	u32 rss_context = 0;
++	int ret;
++
++	oecls_debug("is_del:%d netdev:%s, dip4:%pI4, dport:%d, action:%d, ruleid:%u, del_ruleid:%u\n",
++		    is_del, ctx->netdev, &ctx->dip4, ntohs(ctx->dport), ctx->action, ctx->ruleid,
++		    ctx->del_ruleid);
++
++	if (is_del)
++		return rxclass_rule_del(ctx, ctx->del_ruleid);
++
++	ctx->ret_loc = -1;
++
++	fsp = &rx_rule_fs;
++	memset(fsp, 0, sizeof(*fsp));
++	fsp->flow_type = TCP_V4_FLOW;
++	fsp->location = RX_CLS_LOC_ANY;
++	fsp->h_u.tcp_ip4_spec.ip4dst = ctx->dip4;
++	fsp->h_u.tcp_ip4_spec.pdst = ctx->dport;
++	if (ctx->dip4)
++		fsp->m_u.tcp_ip4_spec.ip4dst = (u32)~0ULL;
++	fsp->m_u.tcp_ip4_spec.pdst = (u16)~0ULL;
++	if (ctx->ruleid)
++		fsp->location = ctx->ruleid;
++	fsp->ring_cookie = ctx->action;
++
++	ret = do_srxntuple(ctx, &rx_rule_fs);
++	if (!ret)
++		return 0;
++
++	ret = rxclass_rule_ins(ctx, &rx_rule_fs, rss_context);
++	if (!ret)
++		ctx->ret_loc = rx_rule_fs.location;
++	return ret;
++}
++
++static void del_ntuple_rule(struct sock *sk)
++{
++	struct oecls_netdev_info *oecls_dev;
++	struct cmd_context ctx = { 0 };
++	struct oecls_sk_rule *rule;
++	int devid;
++	u16 dport;
++	u32 dip4;
++	int err;
++
++	get_sk_rule_addr(sk, &dip4, &dport);
++
++	mutex_lock(&oecls_sk_rules.mutex);
++	for_each_oecls_netdev(devid, oecls_dev) {
++		strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ);
++		rule = get_rule_from_sk(devid, sk);
++		if (!rule) {
++			oecls_debug("rule not found! sk:%p, devid:%d, dip4:%pI4, dport:%d\n",
++				    sk, devid, &dip4, ntohs(dport));
++			continue;
++		}
++
++		// Config Ntuple rule to dev
++		ctx.del_ruleid = rule->ruleid;
++		err = cfg_ethtool_rule(&ctx, true);
++		if (err) {
++			oecls_error("del sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n",
++				    sk, rule->nid, devid, rule->action, rule->ruleid, err);
++		}
++
++		// Free the bound queue
++		free_rxq_id(rule->nid, devid, rule->action);
++
++		// Delete sk rule
++		del_sk_rule(rule);
++	}
++	mutex_unlock(&oecls_sk_rules.mutex);
++}
++
++static void add_ntuple_rule(struct sock *sk)
++{
++	struct oecls_netdev_info *oecls_dev;
++	struct cmd_context ctx = { 0 };
++	int cpu = smp_processor_id();
++	int nid = cpu_to_node(cpu);
++	int rxq_id;
++	int devid;
++	int err;
++
++	if (check_appname(current->comm))
++		return;
++	get_sk_rule_addr(sk, &ctx.dip4, &ctx.dport);
++
++	mutex_lock(&oecls_sk_rules.mutex);
++	for_each_oecls_netdev(devid, oecls_dev) {
++		strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ);
++		if (reuseport_check(devid, ctx.dip4, ctx.dport)) {
++			oecls_error("dip4:%pI4, dport:%d reuse!\n", &ctx.dip4, ctx.dport);
++			continue;
++		}
++
++		// Calculate the bound queue
++		rxq_id = alloc_rxq_id(nid, devid);
++		if (rxq_id < 0)
++			continue;
++
++		// Config Ntuple rule to dev
++		ctx.action = (u16)rxq_id;
++		err = cfg_ethtool_rule(&ctx, false);
++		if (err) {
++			oecls_error("add sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n",
++				    sk, nid, devid, ctx.action, ctx.ret_loc, err);
++			continue;
++		}
++
++		// Add sk rule
++		add_sk_rule(devid, ctx.dip4, ctx.dport, sk, ctx.action, ctx.ret_loc, nid);
++	}
++	mutex_unlock(&oecls_sk_rules.mutex);
++}
++
++static void ethtool_cfg_rxcls(struct sock *sk, int is_del)
++{
++	if (sk->sk_state != TCP_LISTEN)
++		return;
++
++	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
++		return;
++
++	oecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, ip:%pI4, port:%d\n", smp_processor_id(),
++		    current->comm, sk, is_del, &sk->sk_rcv_saddr, (u16)sk->sk_num);
++
++	if (is_del)
++		del_ntuple_rule(sk);
++	else
++		add_ntuple_rule(sk);
++}
++
++static void clean_oecls_sk_rules(void)
++{
++	struct oecls_netdev_info *oecls_dev;
++	struct cmd_context ctx = { 0 };
++	struct oecls_sk_rule *rule;
++	struct hlist_head *hlist;
++	struct hlist_node *n;
++	unsigned int i;
++	int err;
++
++	mutex_lock(&oecls_sk_rules.mutex);
++	for (i = 0; i < OECLS_SK_RULE_HASHSIZE; i++) {
++		hlist = &oecls_sk_rules.hash[i];
++
++		hlist_for_each_entry_safe(rule, n, hlist, node) {
++			oecls_dev = get_oecls_netdev_info(rule->devid);
++			if (!oecls_dev)
++				continue;
++			strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ);
++			ctx.del_ruleid = rule->ruleid;
++			err = cfg_ethtool_rule(&ctx, true);
++			oecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, err:%d\n", rule->sk,
++				    rule->devid, rule->action, rule->ruleid, err);
++
++			hlist_del(&rule->node);
++			oecls_debug("clean rule=%p\n", rule);
++			kfree(rule);
++		}
++	}
++	mutex_unlock(&oecls_sk_rules.mutex);
++}
++
++static const struct oecls_hook_ops oecls_ntuple_ops = {
++	.oecls_flow_update = NULL,
++	.oecls_set_cpu = NULL,
++	.oecls_timeout = NULL,
++	.oecls_cfg_rxcls = ethtool_cfg_rxcls,
++};
++
++void oecls_ntuple_res_init(void)
++{
++	init_oecls_sk_rules();
++	RCU_INIT_POINTER(oecls_ops, &oecls_ntuple_ops);
++}
++
++void oecls_ntuple_res_clean(void)
++{
++	RCU_INIT_POINTER(oecls_ops, NULL);
++	clean_oecls_sk_rules();
++}
+-- 
+2.20.1
+
-- 
2.34.1
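
Editor's note (not part of the patch): the ntuple path above programs the same ETHTOOL_SRXCLSRLINS interface that userspace normally reaches through `ethtool -N <dev> flow-type tcp4 ...`. A minimal userspace sketch of that ioctl is shown below for reference only; the interface name "eth0", the destination port 80 and the queue index 2 are placeholders, not values taken from the patch.

/* Hypothetical userspace sketch: insert one TCP/IPv4 n-tuple steering rule,
 * roughly what cfg_ethtool_rule() automates from kernel context. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_rxnfc nfc = { .cmd = ETHTOOL_SRXCLSRLINS };
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	nfc.fs.flow_type = TCP_V4_FLOW;
	nfc.fs.h_u.tcp_ip4_spec.pdst = htons(80);	/* match destination port 80 */
	nfc.fs.m_u.tcp_ip4_spec.pdst = 0xffff;		/* exact-match the port */
	nfc.fs.ring_cookie = 2;				/* steer matching flows to RX queue 2 */
	nfc.fs.location = RX_CLS_LOC_ANY;		/* let the driver pick a free slot */

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* placeholder interface name */
	ifr.ifr_data = (void *)&nfc;

	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		perror("ETHTOOL_SRXCLSRLINS");
	else
		printf("rule inserted at location %u\n", nfc.fs.location);

	close(fd);
	return 0;
}

The module performs the equivalent of this per listening socket, then frees the slot again (ETHTOOL_SRXCLSRLDEL) when the socket goes away.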