hulk inclusion category: feature Link: https://gitee.com/openeuler/kernel/issues/ICBFCS CVE: NA -------------------------------- This introduces a network optimization method named venetcls. It can configure ntuple rules and automatically bind interrupts to netdev queues. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> Signed-off-by: Wang Liang <wangliang74@huawei.com> Signed-off-by: Liu Jian <liujian56@huawei.com> Signed-off-by: yuelg <yuelg@chinaunicom.cn> --- MAINTAINERS | 5 + include/linux/netdevice.h | 3 + include/linux/venetcls.h | 101 +++ kernel/irq/irqdesc.c | 2 +- net/Kconfig | 1 + net/Makefile | 1 + net/core/dev.c | 23 + net/ipv4/af_inet.c | 6 + net/ipv4/tcp.c | 9 + net/venetcls/Kconfig | 11 + net/venetcls/Makefile | 7 + net/venetcls/asmdefs.S | 61 ++ net/venetcls/memcpy-sve.S | 157 +++++ net/venetcls/venetcls.h | 183 +++++ net/venetcls/venetcls_flow.c | 514 ++++++++++++++ net/venetcls/venetcls_main.c | 1154 ++++++++++++++++++++++++++++++++ net/venetcls/venetcls_ntuple.c | 713 ++++++++++++++++++++ 17 files changed, 2950 insertions(+), 1 deletion(-) create mode 100644 include/linux/venetcls.h create mode 100644 net/venetcls/Kconfig create mode 100644 net/venetcls/Makefile create mode 100644 net/venetcls/asmdefs.S create mode 100644 net/venetcls/memcpy-sve.S create mode 100644 net/venetcls/venetcls.h create mode 100644 net/venetcls/venetcls_flow.c create mode 100644 net/venetcls/venetcls_main.c create mode 100644 net/venetcls/venetcls_ntuple.c diff --git a/MAINTAINERS b/MAINTAINERS index ab1ff9b4195e..861b3418b947 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20568,6 +20568,11 @@ F: net/xdp/ F: samples/bpf/xdpsock* F: tools/lib/bpf/xsk* +VENETCLS +M: Yue Haibing <yuehaibing@huawei.com> +F: include/linux/venetcls.h +F: net/venetcls/ + XEN BLOCK SUBSYSTEM M: Roger Pau Monné <roger.pau@citrix.com> L: xen-devel@lists.xenproject.org (moderated for non-subscribers) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 
cc1f14f3c236..e5f876cecf15 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -766,6 +766,9 @@ struct netdev_rx_queue { struct xsk_buff_pool *pool; #endif struct file __rcu *dmabuf_pages; +#if IS_ENABLED(CONFIG_VENETCLS) + void __rcu *vecls_ftb; +#endif } ____cacheline_aligned_in_smp; struct page * diff --git a/include/linux/venetcls.h b/include/linux/venetcls.h new file mode 100644 index 000000000000..9cfcdd4e5766 --- /dev/null +++ b/include/linux/venetcls.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_VENETCLS_H +#define _LINUX_VENETCLS_H + +struct vecls_hook_ops { + void (*vecls_cfg_rxcls)(struct sock *sk, int is_del); + void (*vecls_flow_update)(struct sock *sk); + void (*vecls_set_cpu)(struct sk_buff *skb, int *cpu, int *last_qtail); + bool (*vecls_timeout)(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id); +}; + +typedef int (*enqueue_f)(struct sk_buff *skb, int cpu, unsigned int *qtail); +extern const struct vecls_hook_ops __rcu *vecls_ops; + +static inline void venetcls_cfg_rxcls(struct sock *sk, int is_del) +{ + const struct vecls_hook_ops *ops; + + rcu_read_lock(); + ops = rcu_dereference(vecls_ops); + if (ops && ops->vecls_cfg_rxcls) + ops->vecls_cfg_rxcls(sk, is_del); + rcu_read_unlock(); +} + +static inline void venetcls_flow_update(struct sock *sk) +{ + const struct vecls_hook_ops *ops; + + rcu_read_lock(); + ops = rcu_dereference(vecls_ops); + if (ops && ops->vecls_flow_update) + ops->vecls_flow_update(sk); + rcu_read_unlock(); +} + +static inline bool +venetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) +{ + const struct vecls_hook_ops *ops; + int cpu, last_qtail; + bool result = false; + + rcu_read_lock(); + ops = rcu_dereference(vecls_ops); + if (ops && ops->vecls_set_cpu) { + cpu = -1; + last_qtail = 0; + ops->vecls_set_cpu(skb, &cpu, &last_qtail); + if (cpu >= 0) { + *ret = enq_func(skb, cpu, &last_qtail); + result = true; + } + } + rcu_read_unlock(); 
+ return result; +} + +static inline void +venetcls_skblist_set_cpu(struct list_head *head, enqueue_f enq_func) +{ + const struct vecls_hook_ops *ops; + struct sk_buff *skb, *next; + int cpu, last_qtail; + + rcu_read_lock(); + ops = rcu_dereference(vecls_ops); + if (ops && ops->vecls_set_cpu) { + list_for_each_entry_safe(skb, next, head, list) { + cpu = -1; + last_qtail = 0; + ops->vecls_set_cpu(skb, &cpu, &last_qtail); + if (cpu >= 0) { + skb_list_del_init(skb); + enq_func(skb, cpu, &last_qtail); + } + } + } + rcu_read_unlock(); +} + +static inline bool venetcls_may_expire_flow(struct net_device *dev, + u16 rxq_index, u32 flow_id, + u16 filter_id, bool *expire) +{ + const struct vecls_hook_ops *ops; + bool ret = false; + + *expire = true; + rcu_read_lock(); + ops = rcu_dereference(vecls_ops); + if (ops && ops->vecls_timeout) { + *expire = ops->vecls_timeout(dev, rxq_index, flow_id, filter_id); + ret = true; + } + rcu_read_unlock(); + return ret; +} + +#endif /* _LINUX_VENETCLS_H */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8202d4a996a5..eb8641e22575 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -366,7 +366,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return radix_tree_lookup(&irq_desc_tree, irq); } -#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE +#if defined(CONFIG_KVM_BOOK3S_64_HV_MODULE) || IS_ENABLED(CONFIG_VENETCLS) EXPORT_SYMBOL_GPL(irq_to_desc); #endif diff --git a/net/Kconfig b/net/Kconfig index dc8451e75e4c..2b68c0f8625e 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -72,6 +72,7 @@ source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" source "net/xdp/Kconfig" +source "net/venetcls/Kconfig" config INET bool "TCP/IP networking" diff --git a/net/Makefile b/net/Makefile index 6a62e5b27378..a2cb1281e2a9 100644 --- a/net/Makefile +++ b/net/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ obj-$(CONFIG_MCTP) += mctp/ 
+obj-$(CONFIG_VENETCLS) += venetcls/ diff --git a/net/core/dev.c b/net/core/dev.c index f628494a1c0f..47b916ca8d46 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -160,6 +160,12 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) +#if IS_ENABLED(CONFIG_VENETCLS) +#include <linux/venetcls.h> +const struct vecls_hook_ops __rcu *vecls_ops __read_mostly; +EXPORT_SYMBOL_GPL(vecls_ops); +#endif + static DEFINE_SPINLOCK(ptype_lock); static DEFINE_SPINLOCK(offload_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; @@ -4770,6 +4776,10 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, bool expire = true; unsigned int cpu; +#if IS_ENABLED(CONFIG_VENETCLS) + if (venetcls_may_expire_flow(dev, rxq_index, flow_id, filter_id, &expire)) + return expire; +#endif rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { @@ -5881,6 +5891,12 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } } +#endif +#if IS_ENABLED(CONFIG_VENETCLS) + if (venetcls_skb_set_cpu(skb, enqueue_to_backlog, &ret)) { + rcu_read_unlock(); + return ret; + } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); @@ -5915,6 +5931,9 @@ static void netif_receive_skb_list_internal(struct list_head *head) } } } +#endif +#if IS_ENABLED(CONFIG_VENETCLS) + venetcls_skblist_set_cpu(head, enqueue_to_backlog); #endif __netif_receive_skb_list(head); rcu_read_unlock(); @@ -10272,6 +10291,10 @@ int __netdev_update_features(struct net_device *dev) return err < 0 ? 
0 : 1; } +#if IS_ENABLED(CONFIG_VENETCLS) +EXPORT_SYMBOL(__netdev_update_features); +#endif + static int netdev_do_alloc_pcpu_stats(struct net_device *dev) { void __percpu *v; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5dc1955e38c4..06b917182a5a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -120,6 +120,9 @@ #include <net/compat.h> #include <trace/events/sock.h> +#if IS_ENABLED(CONFIG_VENETCLS) +#include <linux/venetcls.h> +#endif /* The inetsw table contains everything that inet_create needs to * build a new socket. @@ -229,6 +232,9 @@ int inet_listen(struct socket *sock, int backlog) if (err) goto out; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); +#if IS_ENABLED(CONFIG_VENETCLS) + venetcls_cfg_rxcls(sk, 0); +#endif } err = 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e8b7f0c5dded..cc84873cee0d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -281,6 +281,9 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> #include <linux/dma-buf.h> +#if IS_ENABLED(CONFIG_VENETCLS) +#include <linux/venetcls.h> +#endif /* Track pending CMSGs. 
*/ enum { @@ -2940,6 +2943,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); +#if IS_ENABLED(CONFIG_VENETCLS) + venetcls_flow_update(sk); +#endif if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && sk->sk_state == TCP_ESTABLISHED) @@ -3300,6 +3306,9 @@ void __tcp_close(struct sock *sk, long timeout) void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); +#if IS_ENABLED(CONFIG_VENETCLS) + venetcls_cfg_rxcls(sk, 1); +#endif __tcp_close(sk, timeout); release_sock(sk); sock_put(sk); diff --git a/net/venetcls/Kconfig b/net/venetcls/Kconfig new file mode 100644 index 000000000000..7f2ea5c4a6b0 --- /dev/null +++ b/net/venetcls/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +config VENETCLS + tristate "Network classification" + depends on MODULES + default n + help + This introduces a kind of network optimization method, which can + configure the flow steer rules, and bind interrupt to the netdev + queue automatically. + + This module can only be built as a loadable module. diff --git a/net/venetcls/Makefile b/net/venetcls/Makefile new file mode 100644 index 000000000000..639a81d7d6b2 --- /dev/null +++ b/net/venetcls/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_VENETCLS) = venetcls.o +venetcls-y := venetcls_main.o venetcls_ntuple.o venetcls_flow.o +ifeq ($(CONFIG_ARM64_SVE),y) +venetcls-y += memcpy-sve.o +endif diff --git a/net/venetcls/asmdefs.S b/net/venetcls/asmdefs.S new file mode 100644 index 000000000000..8138a94c18af --- /dev/null +++ b/net/venetcls/asmdefs.S @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Branch Target Identification support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). 
*/ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text + +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. */ +GNU_PROPERTY(FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name, %function; \ + .align alignment; \ +name: \ + .cfi_startproc; \ + BTI_C; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name, %function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#endif diff --git a/net/venetcls/memcpy-sve.S b/net/venetcls/memcpy-sve.S new file mode 100644 index 000000000000..0452ff8b3afb --- /dev/null +++ b/net/venetcls/memcpy-sve.S @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#include "asmdefs.S" + +.arch armv8-a+sve + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x6 +#define vlen x6 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + SVE vectors are used to speedup small copies. 
+ + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_sve) +ENTRY (__memcpy_aarch64_sve) + cmp count, 128 + b.hi L(copy_long) + cntb vlen + cmp count, vlen, lsl 1 + b.hi L(copy32_128) + + whilelo p0.b, xzr, count + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] + ret + + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + add srcend, src, count + add dstend, dstin, count + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + add srcend, src, count + add dstend, dstin, count + + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. 
*/ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(return) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. 
*/ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(return): + ret + +END (__memcpy_aarch64_sve) diff --git a/net/venetcls/venetcls.h b/net/venetcls/venetcls.h new file mode 100644 index 000000000000..14f02cd962c3 --- /dev/null +++ b/net/venetcls/venetcls.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _NET_VENETCLS_H +#define _NET_VENETCLS_H +#include <linux/if.h> +#include <linux/mutex.h> +#include <linux/cpufeature.h> + +#define VECLS_MAX_NETDEV_NUM 8 +#define VECLS_MAX_RXQ_NUM_PER_DEV 256 +#define VECLS_MAX_CPU_NUM 1024 + +#define VECLS_TIMEOUT (5 * HZ) +#define VECLS_NO_FILTER 0xffff +#define VECLS_NO_CPU 0xffff + +#define RXQ_MAX_USECNT 0xFF + +struct vecls_netdev_queue_info { + int irq; + int affinity_cpu; +}; + +struct vecls_netdev_info { + char dev_name[IFNAMSIZ]; + struct net_device *netdev; + int rxq_num; + struct vecls_netdev_queue_info rxq[VECLS_MAX_RXQ_NUM_PER_DEV]; + int old_filter_state; +}; + +struct vecls_rxq { + int rxq_id; + int status; +}; + +struct vecls_numa_clusterinfo { + int cluster_id; + int cur_freeidx; + struct vecls_rxq rxqs[VECLS_MAX_RXQ_NUM_PER_DEV]; +}; + +struct vecls_numa_bound_dev_info { + unsigned char bitmap_rxq[VECLS_MAX_RXQ_NUM_PER_DEV]; + struct vecls_numa_clusterinfo *cluster_info; +}; + +struct vecls_numa_info { + DECLARE_BITMAP(avail_cpus, VECLS_MAX_CPU_NUM); + struct vecls_numa_bound_dev_info bound_dev[VECLS_MAX_NETDEV_NUM]; +}; + +struct cmd_context { + char netdev[IFNAMSIZ]; + bool is_ipv6; + u32 dip4; + u32 dip6[4]; + u16 dport; + u16 action; + u32 ruleid; + u32 del_ruleid; + int ret_loc; +}; + +#define VECLS_SK_RULE_HASHSIZE 256 +#define VECLS_SK_RULE_HASHMASK (VECLS_SK_RULE_HASHSIZE - 1) + +struct vecls_sk_rule_list { + struct hlist_head hash[VECLS_SK_RULE_HASHSIZE]; + /* Mutex to synchronize access to ntuple rule locking */ + struct mutex mutex; +}; + 
+struct vecls_sk_rule { + struct hlist_node node; + int devid; + void *sk; + bool is_ipv6; + u32 dip4; + u32 dip6[4]; + u16 dport; + int action; + int ruleid; + int nid; +}; + +struct vecls_sk_entry { + struct hlist_node node; + void *sk; + u32 sk_rule_hash; +}; + +struct vecls_dev_flow { + unsigned short cpu; + unsigned short filter; + unsigned long timeout; + int isvalid; +}; + +struct vecls_dev_flow_table { + unsigned int mask; + struct rcu_head rcu; + struct vecls_dev_flow flows[]; +}; + +struct vecls_sock_flow_table { + u32 mask; + u32 ents[] ____cacheline_aligned_in_smp; +}; + +#define VECLS_DEV_FLOW_TABLE_NUM 0x1000 +#define VECLS_SOCK_FLOW_TABLE_NUM 0x100000 +#define VECLS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct vecls_dev_flow_table) + \ + ((_num) * sizeof(struct vecls_dev_flow))) +#define VECLS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct vecls_sock_flow_table, ents[_num])) + +#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ + ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ + NETIF_F_RXHASH) + +struct rmgr_ctrl { + int driver_select; + unsigned long *slot; + __u32 n_rules; + __u32 size; +}; + +struct cfg_param { + struct work_struct work; + struct cmd_context ctx; + struct sock *sk; + bool is_del; + int nid; + int cpu; +}; + +extern int match_ip_flag; +extern int debug; +extern int vecls_netdev_num; +extern int vecls_numa_num; + +#define vecls_debug(fmt, ...) \ + do { \ + if (debug) \ + pr_info_ratelimited("venetcls [%s:%d]: " fmt,\ + __FILE__, __LINE__, ## __VA_ARGS__); \ + } while (0) + +#define vecls_error(fmt, ...) 
\ + pr_err_ratelimited("venetcls [%s:%d]: " fmt, __FILE__, __LINE__, ## __VA_ARGS__) + +struct vecls_netdev_info *get_vecls_netdev_info(unsigned int index); +struct vecls_numa_info *get_vecls_numa_info(unsigned int nid); + +#ifdef CONFIG_ARM64_SVE +void *__memcpy_aarch64_sve(void *, const void *, size_t); +#define memcpy_r(dst, src, len) \ + do { \ + void *_dst = dst; \ + const void *_src = src; \ + size_t _len = len; \ + if (system_supports_sve()) \ + __memcpy_aarch64_sve(_dst, _src, _len); \ + else \ + memcpy(_dst, _src, _len); \ + } while (0) +#else +#define memcpy_r(dst, src, len) memcpy(dst, src, len) +#endif + +int check_appname(char *task_name); +int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd); +int alloc_rxq_id(int nid, int cpu, int devid); +void free_rxq_id(int nid, int devid, int rxq_id); +int vecls_ntuple_res_init(void); +void vecls_ntuple_res_clean(void); +int venetcls_ntuple_status(struct seq_file *seq, void *v); +int vecls_flow_res_init(void); +void vecls_flow_res_clean(void); +int venetcls_flow_status(struct seq_file *seq, void *v); + +#endif /* _NET_VENETCLS_H */ diff --git a/net/venetcls/venetcls_flow.c b/net/venetcls/venetcls_flow.c new file mode 100644 index 000000000000..242254b9bfe0 --- /dev/null +++ b/net/venetcls/venetcls_flow.c @@ -0,0 +1,514 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/inetdevice.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/inet.h> +#include <linux/venetcls.h> +#include <net/sock.h> + +#include "venetcls.h" + +static u32 vecls_cpu_mask; +static struct vecls_sock_flow_table __rcu *vecls_sock_flow_table; +static DEFINE_MUTEX(vecls_sock_flow_mutex); +static DEFINE_SPINLOCK(vecls_dev_flow_lock); + +bool is_vecls_config_netdev(const char *name) +{ + struct vecls_netdev_info *vecls_dev; + int devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) 
+ continue; + if (strcmp(vecls_dev->dev_name, name) == 0) + return true; + } + + return false; +} + +static bool _vecls_timeout(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct vecls_dev_flow_table *flow_table; + struct vecls_dev_flow *rflow; + bool expire = true; + unsigned int cpu; + + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->vecls_ftb); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = READ_ONCE(rflow->cpu); + if (rflow->filter == filter_id && cpu < nr_cpu_ids) { + if (time_before(jiffies, rflow->timeout + VECLS_TIMEOUT)) { + expire = false; + } else { + rflow->isvalid = 0; + WRITE_ONCE(rflow->cpu, VECLS_NO_CPU); + } + } + } + rcu_read_unlock(); + if (expire) + vecls_debug("%s, dev:%s, rxq:%d, flow_id:%u, filter_id:%d, expire:%d\n", __func__, + dev->name, rxq_index, flow_id, filter_id, expire); + return expire; +} + +static void _vecls_flow_update(struct sock *sk) +{ + struct vecls_sock_flow_table *tb; + unsigned int hash, index; + u32 val; + u32 cpu = raw_smp_processor_id(); + + if (sk->sk_state != TCP_ESTABLISHED) + return; + + if (check_appname(current->comm)) + return; + + rcu_read_lock(); + tb = rcu_dereference(vecls_sock_flow_table); + hash = READ_ONCE(sk->sk_rxhash); + if (tb && hash) { + index = hash & tb->mask; + val = hash & ~vecls_cpu_mask; + val |= cpu; + + if (READ_ONCE(tb->ents[index]) != val) + WRITE_ONCE(tb->ents[index], val); + } + rcu_read_unlock(); +} + +static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb) +{ + struct vecls_numa_bound_dev_info *bound_dev = NULL; + struct vecls_netdev_info *vecls_dev; + struct vecls_numa_info *numa_info; + int i, devid, rxq_num, rxq_id; + u32 hash, index; + + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + return -1; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + 
if (!vecls_dev) + continue; + if (strcmp(vecls_dev->dev_name, dev->name) == 0) { + bound_dev = &numa_info->bound_dev[devid]; + break; + } + } + if (!bound_dev) + return -1; + + rxq_num = 0; + for (i = 0; i < VECLS_MAX_RXQ_NUM_PER_DEV; i++) { + if (bound_dev->bitmap_rxq[i] == RXQ_MAX_USECNT) + continue; + rxq_num++; + } + if (rxq_num == 0) + return -1; + hash = skb_get_hash(skb); + index = hash % rxq_num; + + i = 0; + for (rxq_id = 0; rxq_id < VECLS_MAX_RXQ_NUM_PER_DEV; rxq_id++) { + if (bound_dev->bitmap_rxq[rxq_id] == RXQ_MAX_USECNT) + continue; + if (i++ == index) + return rxq_id; + } + + vecls_debug("%s skb:%p, no found rxq\n", __func__, skb); + return -1; +} + +static void set_vecls_cpu(struct net_device *dev, struct sk_buff *skb, + struct vecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu) +{ + struct netdev_rx_queue *rxqueue; + struct vecls_dev_flow_table *dtb; + struct vecls_dev_flow *rflow; + u32 flow_id, hash; + int rxq_index, rc; + + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || + !(dev->features & NETIF_F_NTUPLE)) + return; + + rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb); + if (rxq_index == skb_get_rx_queue(skb) || rxq_index < 0) { + vecls_debug("%s skb:%p, old_rxq:%d, next_cpu:%d new_rxq:%d\n", + __func__, skb, old_rxq_id, next_cpu, rxq_index); + return; + } + + rxqueue = dev->_rx + rxq_index; + dtb = rcu_dereference(rxqueue->vecls_ftb); + if (!dtb) + return; + + hash = skb_get_hash(skb); + flow_id = hash & dtb->mask; + rflow = &dtb->flows[flow_id]; + + if (rflow->isvalid && cpu_to_node(rflow->cpu) == cpu_to_node(next_cpu)) { + rflow->timeout = jiffies; + return; + } + + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); + if (rc < 0) { + vecls_debug("skb:%p rxq:%d hash:0x%x flow_id:%u old_rxq:%d rflow->cpu:%d rflow->isvalid:%d next_cpu:%d rc:%d\n", + skb, rxq_index, hash, flow_id, old_rxq_id, rflow->cpu, + rflow->isvalid, next_cpu, rc); + return; + } + + rflow->filter = rc; + rflow->isvalid = 1; + 
rflow->timeout = jiffies; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = VECLS_NO_FILTER; + rflow->cpu = next_cpu; +} + +static int get_cpu_in_numa(int tcpu, u32 hash) +{ + const struct cpumask *mask; + int nr_cpus, cpu, index; + + mask = cpumask_of_node(cpu_to_node(tcpu)); + nr_cpus = cpumask_weight(mask); + if (nr_cpus == 0) + return -1; + + index = reciprocal_scale(hash, nr_cpus); + if (index < 0) + return -1; + + cpu = cpumask_first(mask); + while (--nr_cpus > 0) { + if (index == 0) + break; + cpu = cpumask_next(cpu, mask); + index--; + } + return cpu; +} + +static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, + struct vecls_sock_flow_table *tb, struct vecls_dev_flow_table *dtb, + int old_rxq_id, int *rcpu, int *last_qtail) +{ + u32 last_recv_cpu, hash, val, cpu, tcpu; + struct vecls_dev_flow *rflow; + int newcpu; + + cpu = raw_smp_processor_id(); + skb_reset_network_header(skb); + hash = skb_get_hash(skb); + if (!hash) + return; + + val = READ_ONCE(tb->ents[hash & tb->mask]); + last_recv_cpu = val & vecls_cpu_mask; + rflow = &dtb->flows[hash & dtb->mask]; + tcpu = rflow->cpu; + + if ((val ^ hash) & ~vecls_cpu_mask) + return; + + newcpu = get_cpu_in_numa(last_recv_cpu, hash); + if (newcpu >= 0) + *rcpu = newcpu; + else + newcpu = last_recv_cpu; + + if (cpu_to_node(cpu) == cpu_to_node(newcpu)) + return; + + if (tcpu >= nr_cpu_ids) + set_vecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu); +} + +static void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) +{ + struct net_device *ndev = skb->dev; + struct vecls_sock_flow_table *stb; + struct vecls_dev_flow_table *dtb; + struct netdev_rx_queue *rxqueue; + int rxq_id = -1; + + *cpu = -1; + last_qtail = 0;//unused + if (!ndev) + return; + + if (!is_vecls_config_netdev(ndev->name)) + return; + + rxqueue = ndev->_rx; + if (skb_rx_queue_recorded(skb)) { + rxq_id = skb_get_rx_queue(skb); + if (rxq_id >= ndev->real_num_rx_queues) { + vecls_debug("%s ndev:%s rxq:%d 
real_num:%d\n", __func__, + ndev->name, rxq_id, ndev->real_num_rx_queues); + return; + } + rxqueue += rxq_id; + } + + if (rxq_id < 0) + return; + + rcu_read_lock(); + stb = rcu_dereference(vecls_sock_flow_table); + dtb = rcu_dereference(rxqueue->vecls_ftb); + if (stb && dtb) + __vecls_set_cpu(skb, ndev, stb, dtb, rxq_id, cpu, last_qtail); + rcu_read_unlock(); +} + +static void vecls_dev_flow_table_free(struct rcu_head *rcu) +{ + struct vecls_dev_flow_table *table = container_of(rcu, + struct vecls_dev_flow_table, rcu); + vfree(table); +} + +static void vecls_dev_flow_table_cleanup(struct net_device *netdev, int queues) +{ + struct vecls_dev_flow_table *dtb; + struct netdev_rx_queue *queue; + int i; + + for (i = 0; i < queues; i++) { + queue = netdev->_rx + i; + spin_lock(&vecls_dev_flow_lock); + dtb = rcu_dereference_protected(queue->vecls_ftb, + lockdep_is_held(&vecls_dev_flow_lock)); + rcu_assign_pointer(queue->vecls_ftb, NULL); + spin_unlock(&vecls_dev_flow_lock); + if (dtb) + call_rcu(&dtb->rcu, vecls_dev_flow_table_free); + } +} + +static int vecls_dev_flow_table_release(void) +{ + struct vecls_netdev_info *vecls_dev; + struct net_device *netdev; + int devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + netdev = vecls_dev->netdev; + if (!netdev) + continue; + vecls_dev_flow_table_cleanup(netdev, netdev->num_rx_queues); + } + + return 0; +} + +static int _vecls_dev_flow_table_init(struct net_device *netdev) +{ + struct vecls_dev_flow_table *table; + int size = VECLS_DEV_FLOW_TABLE_NUM; + struct netdev_rx_queue *queue; + int i, j, ret = 0; + + size = roundup_pow_of_two(size); + vecls_debug("%s dev:%s num_rx_queues:%d mask:0x%x\n", + __func__, netdev->name, netdev->num_rx_queues, size - 1); + + for (i = 0; i < netdev->num_rx_queues; i++) { + table = vmalloc(VECLS_DEV_FLOW_TABLE_SIZE(size)); + if (!table) { + ret = -ENOMEM; + goto fail; + } + + table->mask = size - 1; + for (j 
= 0; j < size; j++) { + table->flows[j].cpu = VECLS_NO_CPU; + table->flows[j].isvalid = 0; + } + + queue = netdev->_rx + i; + + spin_lock(&vecls_dev_flow_lock); + rcu_assign_pointer(queue->vecls_ftb, table); + spin_unlock(&vecls_dev_flow_lock); + } + return ret; +fail: + vecls_dev_flow_table_cleanup(netdev, i); + return ret; +} + +static int vecls_dev_flow_table_init(void) +{ + struct vecls_netdev_info *vecls_dev; + struct net_device *ndev; + int i, err, devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + ndev = vecls_dev->netdev; + if (!ndev) + continue; + err = _vecls_dev_flow_table_init(ndev); + if (err) + goto out; + } + + return 0; +out: + for (i = 0; i < devid; i++) { + vecls_dev = get_vecls_netdev_info(i); + ndev = vecls_dev->netdev; + if (!ndev) + continue; + vecls_dev_flow_table_cleanup(ndev, ndev->num_rx_queues); + } + return err; +} + +static const struct vecls_hook_ops vecls_flow_ops = { + .vecls_flow_update = _vecls_flow_update, + .vecls_set_cpu = _vecls_set_cpu, + .vecls_timeout = _vecls_timeout, + .vecls_cfg_rxcls = NULL, +}; + +static int vecls_sock_flow_table_release(void) +{ + struct vecls_sock_flow_table *tb; + + mutex_lock(&vecls_sock_flow_mutex); + tb = rcu_dereference_protected(vecls_sock_flow_table, + lockdep_is_held(&vecls_sock_flow_mutex)); + if (tb) + rcu_assign_pointer(vecls_sock_flow_table, NULL); + mutex_unlock(&vecls_sock_flow_mutex); + synchronize_rcu(); + vfree(tb); + + return 0; +} + +int venetcls_flow_status(struct seq_file *seq, void *v) +{ + struct vecls_netdev_info *vecls_dev; + struct vecls_dev_flow_table *dtb; + struct netdev_rx_queue *queue; + struct net_device *netdev; + int devid, i, j; + unsigned long timeout; + + seq_printf(seq, "%-16s %-6s %-12s %-12s %-12s\n", + "Interface", "rxq", "flowCPU", "filterId", "timeout"); + spin_lock(&vecls_dev_flow_lock); + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = 
get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + netdev = vecls_dev->netdev; + if (!netdev) + continue; + for (i = 0; i < netdev->num_rx_queues; i++) { + queue = netdev->_rx + i; + dtb = rcu_dereference_protected(queue->vecls_ftb, + lockdep_is_held(&vecls_dev_flow_lock)); + if (!dtb) + continue; + for (j = 0; j < VECLS_DEV_FLOW_TABLE_NUM; j++) { + if (dtb->flows[j].cpu == VECLS_NO_CPU) + continue; + if (dtb->flows[j].isvalid == 0) + continue; + timeout = dtb->flows[j].timeout + VECLS_TIMEOUT; + if (time_before(jiffies, timeout)) { + seq_printf(seq, "%-16s %-6d %-12d %-12d %-12u\n", + vecls_dev->dev_name, i, dtb->flows[j].cpu, + dtb->flows[j].filter, + jiffies_to_msecs(timeout - jiffies)); + } + } + } + } + spin_unlock(&vecls_dev_flow_lock); + + return 0; +} + +static int vecls_sock_flow_table_init(void) +{ + struct vecls_sock_flow_table *table; + int size = VECLS_SOCK_FLOW_TABLE_NUM; + int i; + + size = roundup_pow_of_two(size); + table = vmalloc(VECLS_SOCK_FLOW_TABLE_SIZE(size)); + if (!table) + return -ENOMEM; + + vecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + vecls_debug("nr_cpu_ids:%d, vecls_cpu_mask:0x%x\n", nr_cpu_ids, vecls_cpu_mask); + + table->mask = size - 1; + for (i = 0; i < size; i++) + table->ents[i] = VECLS_NO_CPU; + + mutex_lock(&vecls_sock_flow_mutex); + rcu_assign_pointer(vecls_sock_flow_table, table); + mutex_unlock(&vecls_sock_flow_mutex); + + return 0; +} + +int vecls_flow_res_init(void) +{ + int err; + + err = vecls_sock_flow_table_init(); + if (err) + return err; + err = vecls_dev_flow_table_init(); + if (err) + goto clean; + + RCU_INIT_POINTER(vecls_ops, &vecls_flow_ops); + synchronize_rcu(); + + return 0; +clean: + vecls_sock_flow_table_release(); + return err; +} + +void vecls_flow_res_clean(void) +{ + RCU_INIT_POINTER(vecls_ops, NULL); + synchronize_rcu(); + vecls_sock_flow_table_release(); + vecls_dev_flow_table_release(); +} diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c new file mode 
100644 index 000000000000..856ff3b4427e --- /dev/null +++ b/net/venetcls/venetcls_main.c @@ -0,0 +1,1154 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/netdev_features.h> +#include <linux/ethtool.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/proc_fs.h> +#include <linux/rtnetlink.h> +#include <linux/seq_file.h> +#include "venetcls.h" + +int vecls_netdev_num; +static struct vecls_netdev_info vecls_netdev_info_table[VECLS_MAX_NETDEV_NUM]; + +int vecls_numa_num; +static int vecls_cluster_cpu_num, vecls_cluster_per_numa; +static struct vecls_numa_info *vecls_numa_info_table; + +int debug; +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "debug switch"); + +static int mode; +module_param(mode, int, 0444); +MODULE_PARM_DESC(mode, "mode, default 0"); + +static char ifname[64] = { 0 }; +module_param_string(ifname, ifname, sizeof(ifname), 0444); +MODULE_PARM_DESC(ifname, "ifname"); + +static char appname[64] = "redis-server"; +module_param_string(appname, appname, sizeof(appname), 0644); +MODULE_PARM_DESC(appname, "appname, default redis-server"); + +int match_ip_flag = 1; +module_param(match_ip_flag, int, 0644); +MODULE_PARM_DESC(match_ip_flag, "match ip flag"); + +static int strategy; +module_param(strategy, int, 0444); +MODULE_PARM_DESC(strategy, "strategy, default 0"); + +static int rxq_multiplex_limit = 1; +module_param(rxq_multiplex_limit, int, 0444); +MODULE_PARM_DESC(rxq_multiplex_limit, "rxq multiplex limit num, default 1"); + +static char irqname[64] = "comp"; +module_param_string(irqname, irqname, sizeof(irqname), 0644); +MODULE_PARM_DESC(irqname, "nic irq name string, default comp"); + +static bool check_params(void) +{ + if (mode != 0 && mode != 1) + return false; + + if (strlen(ifname) == 0) + return false; + + return true; +} + +int check_appname(char *task_name) +{ + char *start = appname, *end; + + if (!strlen(appname)) + return 0; + + // support 
appname: app1#app2#appN + while (*start != '\0') { + end = strchr(start, '#'); + if (end == start) { + start++; + continue; + } + + if (!end) { + if (!strncmp(task_name, start, strlen(start))) + return 0; + break; + } + + if (!strncmp(task_name, start, end - start)) + return 0; + start = end + 1; + } + return -EOPNOTSUPP; +} + +static u32 __ethtool_get_flags(struct net_device *dev) +{ + u32 flags = 0; + + if (dev->features & NETIF_F_LRO) + flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) + flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) + flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) + flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) + flags |= ETH_FLAG_RXHASH; + + return flags; +} + +static int __ethtool_set_flags(struct net_device *dev, u32 data) +{ + netdev_features_t features = 0, changed; + + if (data & ~ETH_ALL_FLAGS) + return -EINVAL; + + if (data & ETH_FLAG_LRO) + features |= NETIF_F_LRO; + if (data & ETH_FLAG_RXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_RX; + if (data & ETH_FLAG_TXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_TX; + if (data & ETH_FLAG_NTUPLE) + features |= NETIF_F_NTUPLE; + if (data & ETH_FLAG_RXHASH) + features |= NETIF_F_RXHASH; + + /* allow changing only bits set in hw_features */ + changed = (features ^ dev->features) & ETH_ALL_FEATURES; + if (changed & ~dev->hw_features) + return (changed & dev->hw_features) ? 
-EINVAL : -EOPNOTSUPP; + + dev->wanted_features = + (dev->wanted_features & ~changed) | (features & changed); + + __netdev_update_features(dev); + + return 0; +} + +static void ethtool_rxnfc_copy_to_user(void *useraddr, + const struct ethtool_rxnfc *rxnfc, + size_t size, const u32 *rule_buf) +{ + memcpy_r(useraddr, rxnfc, size); + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + + if (rule_buf) + memcpy_r(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32)); +} + +static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, + u32 cmd, void *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + int rc; + + if (!dev->ethtool_ops->set_rxnfc) + return -EOPNOTSUPP; + + if (cmd == ETHTOOL_SRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + memcpy_r(&info, useraddr, info_size); + rc = dev->ethtool_ops->set_rxnfc(dev, &info); + if (rc) + return rc; + + if (cmd == ETHTOOL_SRXCLSRLINS) + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); + + return 0; +} + +static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, + u32 cmd, void *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + if (cmd == ETHTOOL_GRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + memcpy_r(&info, useraddr, info_size); + + /* If FLOW_RSS was requested then user-space must be using the + * new definition, as FLOW_RSS is newer. + */ + if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { + info_size = sizeof(info); + memcpy_r(&info, useraddr, info_size); + /* Since malicious users may modify the original data, + * we need to check whether FLOW_RSS is still requested. 
+ */ + if (!(info.flow_type & FLOW_RSS)) + return -EINVAL; + } + + if (info.cmd != cmd) + return -EINVAL; + + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) + rule_buf = kcalloc(info.rule_cnt, sizeof(u32), + GFP_KERNEL); + if (!rule_buf) + return -ENOMEM; + } + } + + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); +err_out: + kfree(rule_buf); + + return ret; +} + +static noinline_for_stack int ethtool_get_channels(struct net_device *dev, + void *useraddr) +{ + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + + if (!dev->ethtool_ops->get_channels) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_channels(dev, &channels); + + memcpy_r(useraddr, &channels, sizeof(channels)); + return 0; +} + +static int ethtool_get_value(struct net_device *dev, char *useraddr, + u32 cmd, u32 (*actor)(struct net_device *)) +{ + struct ethtool_value edata = { .cmd = cmd }; + + if (!actor) + return -EOPNOTSUPP; + + edata.data = actor(dev); + + memcpy_r(useraddr, &edata, sizeof(edata)); + return 0; +} + +static int ethtool_set_value(struct net_device *dev, char *useraddr, + int (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + memcpy_r(&edata, useraddr, sizeof(edata)); + + return actor(dev, edata.data); +} + +/* + * dev_ethtool_kern - run one ethtool command from inside the kernel. + * @net: namespace in which ifr->ifr_name is looked up + * @ifr: request; ifr_name selects the device and ifr_data points to the + * command buffer, whose first u32 is the ethtool command code + * + * The buffer is accessed with memcpy_r(), i.e. it is a kernel buffer, not a + * user pointer. The device is looked up with __dev_get_by_name(), which + * takes no reference. + * + * Returns -ENODEV when the device is missing or not present, otherwise the + * result of the selected command handler. + */ +static int dev_ethtool_kern(struct net *net, struct ifreq *ifr) +{ + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + void *useraddr = ifr->ifr_data; + u32 ethcmd, sub_cmd; + int rc; + netdev_features_t old_features; + + if (!dev || !netif_device_present(dev)) + return -ENODEV; + + /* Read the command word that heads every ethtool request. */ + memcpy_r(&ethcmd, useraddr, sizeof(ethcmd)); + + if (ethcmd == ETHTOOL_PERQUEUE) + memcpy_r(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)); + else + sub_cmd = ethcmd; + + if (dev->ethtool_ops->begin) { + rc = dev->ethtool_ops->begin(dev); +
if (rc < 0) + return rc; + } + old_features = dev->features; + + switch (ethcmd) { + case ETHTOOL_GFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + __ethtool_get_flags); + break; + case ETHTOOL_SFLAGS: + rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); + break; + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_SRXFH: + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_GCHANNELS: + rc = ethtool_get_channels(dev, useraddr); + break; + default: + rc = -EOPNOTSUPP; + } + + if (dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + + if (old_features != dev->features) + netdev_features_change(dev); + + return rc; +} + +int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd) +{ + struct ifreq ifr = {0}; + int ret; + + strscpy(ifr.ifr_name, ctx->netdev, IFNAMSIZ); + ifr.ifr_data = cmd; + + rtnl_lock(); + ret = dev_ethtool_kern(&init_net, &ifr); + rtnl_unlock(); + + return ret; +} + +struct vecls_netdev_info *get_vecls_netdev_info(unsigned int index) +{ + if (index >= VECLS_MAX_NETDEV_NUM) + return NULL; + return &vecls_netdev_info_table[index]; +} + +static struct vecls_netdev_info *alloc_vecls_netdev_info(void) +{ + if (vecls_netdev_num >= VECLS_MAX_NETDEV_NUM) + return NULL; + + return &vecls_netdev_info_table[vecls_netdev_num++]; +} + +static bool check_irq_name(const char *irq_name, struct vecls_netdev_info *vecls_dev) +{ + if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx") && + strlen(irqname) > 0 && !strstr(irq_name, irqname)) + return false; + + if (strstr(irq_name, vecls_dev->dev_name)) + return true; + + if (vecls_dev->netdev->dev.parent && + strstr(irq_name, dev_name(vecls_dev->netdev->dev.parent))) + return true; + + return false; +} + +static void 
get_netdev_queue_info(struct vecls_netdev_info *vecls_dev) +{ + struct vecls_netdev_queue_info *rxq_info; + struct irq_desc *desc; + int irq, cpu; + + for_each_irq_desc(irq, desc) { + if (!desc->action) + continue; + if (!desc->action->name) + continue; + if (!check_irq_name(desc->action->name, vecls_dev)) + continue; + if (vecls_dev->rxq_num >= VECLS_MAX_RXQ_NUM_PER_DEV) + break; + rxq_info = &vecls_dev->rxq[vecls_dev->rxq_num++]; + rxq_info->irq = irq; + cpu = cpumask_first(irq_data_get_effective_affinity_mask(&desc->irq_data)); + rxq_info->affinity_cpu = cpu; + vecls_debug("irq=%d, [%s], rxq_id=%d affinity_cpu:%d\n", + irq, desc->action->name, vecls_dev->rxq_num - 1, cpu); + } +} + +static int vecls_filter_enable(const char *dev_name, bool *old_state) +{ + struct ethtool_value eval = {0}; + struct cmd_context ctx = {0}; + int ret; + + strscpy(ctx.netdev, dev_name, IFNAMSIZ); + + eval.cmd = ETHTOOL_GFLAGS; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + if (eval.data & ETH_FLAG_NTUPLE) { + *old_state = true; + vecls_debug("%s ntuple is already on\n", dev_name); + return 0; + } + + // Set ntuple feature + eval.cmd = ETHTOOL_SFLAGS; + eval.data |= ETH_FLAG_NTUPLE; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + vecls_error("set %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + + // Get ntuple feature + eval.cmd = ETHTOOL_GFLAGS; + eval.data = 0; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + if (!(eval.data & ETH_FLAG_NTUPLE)) { + vecls_error("enable ntuple feature fail!\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +static void vecls_filter_restore(const char *dev_name, bool old_state) +{ + struct ethtool_value eval = {0}; + struct cmd_context ctx = {0}; + bool cur_filter_state; + int ret; + + strscpy(ctx.netdev, dev_name, IFNAMSIZ); + + eval.cmd = 
ETHTOOL_GFLAGS; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return; + } + + cur_filter_state = (eval.data & ETH_FLAG_NTUPLE) ? true : false; + if (cur_filter_state == old_state) + return; + + // Set ntuple feature + eval.cmd = ETHTOOL_SFLAGS; + if (old_state) + eval.data |= ETH_FLAG_NTUPLE; + else + eval.data &= ~ETH_FLAG_NTUPLE; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + vecls_error("set %s flags fail, ret:%d\n", dev_name, ret); + return; + } +} + +/* + * init_single_vecls_dev - register one interface for venetcls. + * @if_name: start of the interface name; when it comes from a '#'-separated + * list the name is NOT NUL-terminated at @length + * @length: number of characters of @if_name that form this name + * + * Looks the device up in init_net, rejects devices that are down or + * loopback, enables the ntuple feature and records the device in the + * netdev table. + * + * Returns 0 on success or a negative errno. + */ +static int init_single_vecls_dev(char *if_name, unsigned int length) +{ + struct vecls_netdev_info *vecls_dev; + char dev_name[IFNAMSIZ] = { 0 }; + struct net_device *netdev; + bool old_state = false; + int ret; + + /* Copy only this token: strscpy() copies at most count - 1 bytes and + * always NUL-terminates, so "eth0#eth1" yields "eth0" for length 4. + */ + strscpy(dev_name, if_name, min_t(size_t, (size_t)length + 1, IFNAMSIZ)); + netdev = dev_get_by_name(&init_net, dev_name); + if (!netdev) { + vecls_error("dev [%s] is not exist!\n", dev_name); + return -ENODEV; + } + + if (!(netdev->flags & IFF_UP)) { + ret = -ENETDOWN; + vecls_error("dev:%s not up! flags=%d.\n", dev_name, netdev->flags); + goto out; + } + + if (netdev->flags & IFF_LOOPBACK) { + ret = -EOPNOTSUPP; + vecls_error("Do not support loopback.\n"); + goto out; + } + + ret = vecls_filter_enable(dev_name, &old_state); + if (ret) { + vecls_error("dev [%s] not support ntuple! ret=%d\n", dev_name, ret); + goto out; + } + + vecls_dev = alloc_vecls_netdev_info(); + if (!vecls_dev) { + ret = -ENOMEM; + vecls_filter_restore(dev_name, old_state); + vecls_error("alloc vecls_dev fail! 
vecls_netdev_num:%d\n", vecls_netdev_num); + goto out; + } + + memcpy_r(vecls_dev->dev_name, dev_name, IFNAMSIZ); + vecls_dev->old_filter_state = old_state; + vecls_dev->netdev = netdev; + get_netdev_queue_info(vecls_dev); + return 0; + +out: + dev_put(netdev); + return ret; +} + +static void clean_vecls_netdev_info(void) +{ + struct vecls_netdev_info *vecls_dev; + struct net_device *netdev; + int devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + vecls_filter_restore(vecls_dev->dev_name, vecls_dev->old_filter_state); + netdev = vecls_dev->netdev; + if (netdev) { + vecls_dev->netdev = NULL; + dev_put(netdev); + } + } + + vecls_netdev_num = 0; +} + +static int init_vecls_netdev_info(char *netdev_str) +{ + char *start = netdev_str, *end; + int err = -ENODEV; + + while (*start != '\0') { + // skip start # + end = strchr(start, '#'); + if (end == start) { + start++; + continue; + } + + // find the last ifname + if (!end) { + err = init_single_vecls_dev(start, strlen(start)); + break; + } + + err = init_single_vecls_dev(start, end - start); + if (err) + break; + start = end + 1; + } + + return err; +} + +struct vecls_numa_info *get_vecls_numa_info(unsigned int nid) +{ + if (nid >= vecls_numa_num) + return NULL; + return &vecls_numa_info_table[nid]; +} + +static void clean_vecls_numa_info(void) +{ + vecls_numa_num = 0; + kfree(vecls_numa_info_table); +} + +static void init_numa_avail_cpus(int nid, struct vecls_numa_info *numa_info) +{ + int cpu; + + vecls_debug("numa node %d: %*pb, %*pbl\n", nid, cpumask_pr_args(cpumask_of_node(nid)), + cpumask_pr_args(cpumask_of_node(nid))); + + bitmap_zero(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); + for_each_cpu(cpu, cpumask_of_node(nid)) { + if (cpu >= VECLS_MAX_CPU_NUM) + return; + set_bit(cpu, numa_info->avail_cpus); + } +} + +static void clean_vecls_rxq(void) +{ + struct vecls_numa_bound_dev_info *bound_dev; + struct vecls_netdev_info 
*vecls_dev; + struct vecls_numa_info *numa_info; + int nid, devid; + + for (nid = 0; nid < vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + bound_dev = &numa_info->bound_dev[devid]; + kfree(bound_dev->cluster_info); + } + } +} + +static int init_numa_rxq_bitmap(int nid, struct vecls_numa_info *numa_info) +{ + int bound_rxq_num, cluster_id, cluster_idx, cur_idx; + struct vecls_numa_bound_dev_info *bound_dev; + struct vecls_netdev_info *vecls_dev; + int i, j, rxq_id, devid, cpu, ret = 0; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + bound_rxq_num = 0; + bound_dev = &numa_info->bound_dev[devid]; + memset(bound_dev->bitmap_rxq, RXQ_MAX_USECNT, sizeof(bound_dev->bitmap_rxq)); + bound_dev->cluster_info = kcalloc(vecls_cluster_per_numa, + sizeof(*bound_dev->cluster_info), GFP_ATOMIC); + if (!bound_dev->cluster_info) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < vecls_cluster_per_numa; i++) { + for (j = 0; j < VECLS_MAX_RXQ_NUM_PER_DEV; j++) { + bound_dev->cluster_info[i].rxqs[j].rxq_id = -1; + bound_dev->cluster_info[i].rxqs[j].status = RXQ_MAX_USECNT; + } + } + + for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { + cpu = vecls_dev->rxq[rxq_id].affinity_cpu; + if (cpu_to_node(cpu) == nid) { + bound_dev->bitmap_rxq[rxq_id] = 0; + cluster_id = cpu / vecls_cluster_cpu_num; + cluster_idx = cluster_id % vecls_cluster_per_numa; + bound_dev->cluster_info[cluster_idx].cluster_id = cluster_id; + cur_idx = bound_dev->cluster_info[cluster_idx].cur_freeidx++; + bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].rxq_id = rxq_id; + bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].status = 0; + bound_rxq_num++; + vecls_debug("cpu:%d cluster_id:%d cluster_idx:%d rxq_id:%d cur_idx:%d\n", + cpu, cluster_id, 
cluster_idx, rxq_id, cur_idx); + } + } + + vecls_debug("nid:%d, dev_id:%d, dev:%s, rxq_num:%d, bound_rxq_num:%d\n", + nid, devid, vecls_dev->dev_name, vecls_dev->rxq_num, bound_rxq_num); + } + return ret; + +out: + clean_vecls_rxq(); + return ret; +} + +/* + * get_cluster_rxq - take an rx queue from the cluster that @cpu belongs to. + * @bound_dev: per-NUMA, per-device queue accounting + * @cpu: CPU whose cluster (cpu / vecls_cluster_cpu_num) is searched + * + * Picks the least-used queue of the matching cluster and bumps its use + * count, provided that count is below both RXQ_MAX_USECNT and + * rxq_multiplex_limit. + * + * Returns the rx queue id, or -1 when no cluster matches or no queue in the + * matching cluster is available. + */ +static int get_cluster_rxq(struct vecls_numa_bound_dev_info *bound_dev, int cpu) +{ + int cluster_id = cpu / vecls_cluster_cpu_num; + int min_used_count, best; + int i, j, rxq_id = -1; + + for (i = 0; i < vecls_cluster_per_numa; i++) { + if (cluster_id != bound_dev->cluster_info[i].cluster_id) + continue; + + /* Scan every slot so the least-used queue wins, not the first one. */ + best = -1; + min_used_count = RXQ_MAX_USECNT; + for (j = 0; j < VECLS_MAX_RXQ_NUM_PER_DEV; j++) { + if (bound_dev->cluster_info[i].rxqs[j].rxq_id == -1) + continue; + if (bound_dev->cluster_info[i].rxqs[j].status < min_used_count) { + min_used_count = bound_dev->cluster_info[i].rxqs[j].status; + best = j; + } + } + if (best < 0 || min_used_count >= RXQ_MAX_USECNT || + min_used_count >= rxq_multiplex_limit) { + vecls_debug("cluster:%d no free rxq for cpu:%d\n", cluster_id, cpu); + continue; + } + + rxq_id = bound_dev->cluster_info[i].rxqs[best].rxq_id; + bound_dev->cluster_info[i].rxqs[best].status++; + vecls_debug("cluster:%d cpu:%d alloc rxq_id:%d use:%d\n", cluster_id, cpu, + rxq_id, bound_dev->cluster_info[i].rxqs[best].status); + /* Stop here: a later entry with the same cluster_id must not take a + * second reference or overwrite the result. + */ + break; + } + vecls_debug("%s allcluster:%d rxq:%d for cpu:%d\n", __func__, cluster_id, rxq_id, cpu); + return rxq_id; +} + +static int put_cluster_rxq(struct vecls_numa_bound_dev_info *bound_dev, int rxq_id) +{ + int i, j; + + for (i = 0; i < vecls_cluster_per_numa; i++) { + for (j = 0; j < VECLS_MAX_RXQ_NUM_PER_DEV; j++) { + if (bound_dev->cluster_info[i].rxqs[j].status > 0 && + bound_dev->cluster_info[i].rxqs[j].rxq_id == rxq_id) { + bound_dev->cluster_info[i].rxqs[j].status--; + vecls_debug("free rxq_id:%d use:%d\n", rxq_id, + bound_dev->cluster_info[i].rxqs[j].status); + return 0; + } + } + } + vecls_debug("no match malloced rxq_id:%d\n", rxq_id); + return -1; +} + +int alloc_rxq_id(int nid, int cpu, int devid) +{ + struct 
vecls_numa_bound_dev_info *bound_dev; + int i, rxq_id, min_used_count = RXQ_MAX_USECNT; + struct vecls_numa_info *numa_info; + + numa_info = get_vecls_numa_info(nid); + if (!numa_info) { + vecls_error("error nid:%d\n", nid); + return -EINVAL; + } + + if (devid >= VECLS_MAX_NETDEV_NUM) { + vecls_error("error bound_dev index:%d\n", devid); + return -EINVAL; + } + bound_dev = &numa_info->bound_dev[devid]; + + if (strategy == 1) { + rxq_id = get_cluster_rxq(bound_dev, cpu); + if (rxq_id < 0 || rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) + vecls_debug("failed to get rxq_id:%d in cluster, try numa\n", rxq_id); + else + goto found; + } + + for (i = 0; i < VECLS_MAX_RXQ_NUM_PER_DEV; i++) { + if (bound_dev->bitmap_rxq[i] < min_used_count) { + min_used_count = bound_dev->bitmap_rxq[i]; + rxq_id = i; + } + } + if (min_used_count >= RXQ_MAX_USECNT || min_used_count >= rxq_multiplex_limit) { + vecls_error("alloc rxq fail! nid:%d, devid:%d\n", nid, devid); + return -EINVAL; + } + +found: + bound_dev->bitmap_rxq[rxq_id]++; + vecls_debug("alloc nid:%d, dev_id:%d, rxq_id:%d use:%d\n", nid, devid, + rxq_id, bound_dev->bitmap_rxq[rxq_id]); + return rxq_id; +} + +void free_rxq_id(int nid, int devid, int rxq_id) +{ + struct vecls_numa_bound_dev_info *bound_dev; + struct vecls_numa_info *numa_info; + + numa_info = get_vecls_numa_info(nid); + if (!numa_info) { + vecls_error("error nid:%d\n", nid); + return; + } + + if (devid >= VECLS_MAX_NETDEV_NUM) { + vecls_error("error bound_dev index:%d\n", devid); + return; + } + bound_dev = &numa_info->bound_dev[devid]; + + if (rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) { + vecls_error("error rxq_id:%d\n", rxq_id); + return; + } + + if (strategy == 1) + put_cluster_rxq(bound_dev, rxq_id); + + if (bound_dev->bitmap_rxq[rxq_id] <= 0) { + vecls_error("error nid:%d, devid:%d, rxq_id:%d\n", nid, devid, rxq_id); + return; + } + + bound_dev->bitmap_rxq[rxq_id]--; + vecls_debug("free nid:%d, dev_id:%d, rxq_id:%d use:%d\n", nid, devid, + rxq_id, 
bound_dev->bitmap_rxq[rxq_id]); +} + +/* + * init_vecls_numa_info - allocate the per-NUMA table and derive the + * cluster geometry. + * + * Sets vecls_numa_num, vecls_cluster_cpu_num and vecls_cluster_per_numa and + * fills the available-CPU bitmap of every node. Both cluster values are + * clamped to at least 1 because other code in this file divides and takes + * remainders by them. + * + * Returns 0 on success or -ENOMEM. + */ +static int init_vecls_numa_info(void) +{ + struct vecls_numa_info *numa_info; + int nid, ret = 0; + + vecls_numa_num = num_online_nodes(); + /* Only called from module init, where sleeping allocations are fine. */ + vecls_numa_info_table = kcalloc(vecls_numa_num, sizeof(*vecls_numa_info_table), GFP_KERNEL); + if (!vecls_numa_info_table) { + ret = -ENOMEM; + vecls_error("vecls_numa_info_table alloc failed:%d\n", ret); + return ret; + } + + vecls_cluster_cpu_num = cpumask_weight(topology_cluster_cpumask(raw_smp_processor_id())); + if (vecls_cluster_cpu_num < 1) + vecls_cluster_cpu_num = 1; + vecls_cluster_per_numa = (nr_cpu_ids / vecls_cluster_cpu_num) / vecls_numa_num; + /* Integer division can yield 0 when there are fewer clusters than nodes. */ + if (vecls_cluster_per_numa < 1) + vecls_cluster_per_numa = 1; + vecls_debug("vecls_numa_num=%d cluster_per_numa:%d cluster_cpu_num:%d\n", + vecls_numa_num, vecls_cluster_per_numa, vecls_cluster_cpu_num); + + for (nid = 0; nid < vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + init_numa_avail_cpus(nid, numa_info); + } + + return ret; +} + +static int alloc_available_cpu(int nid, struct vecls_numa_info *numa_info) +{ + int cpu; + + cpu = find_first_bit(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); + if (cpu >= VECLS_MAX_CPU_NUM) { + vecls_error("no available cpus: nid=%d, cpu=%d\n", nid, cpu); + return -1; + } + + clear_bit(cpu, numa_info->avail_cpus); + return cpu; +} + +static void add_netdev_irq_affinity_cpu(struct vecls_netdev_info *vecls_dev, int rxq_id, int cpu) +{ + struct vecls_netdev_queue_info *rxq_info; + + if (rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) + return; + + rxq_info = &vecls_dev->rxq[rxq_id]; + rxq_info->affinity_cpu = cpu; +} + +static void config_affinity_strategy_default(struct vecls_netdev_info *vecls_dev) +{ + struct vecls_numa_info *numa_info; + int rxq_num = vecls_dev->rxq_num; + int rxq_per_numa = rxq_num / vecls_numa_num; + int remain = rxq_num - rxq_per_numa * vecls_numa_num; + int numa_rxq_id, rxq_id, nid, cpu; + + vecls_debug("dev=%s, rxq_num=%d, rxq_per_numa=%d, remain=%d\n", vecls_dev->dev_name, + rxq_num, rxq_per_numa, remain); + + // average config rxq to every numa + for (nid = 0; nid < 
vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + for (numa_rxq_id = 0; numa_rxq_id < rxq_per_numa; numa_rxq_id++) { + cpu = alloc_available_cpu(nid, numa_info); + if (cpu < 0) + break; + + rxq_id = rxq_per_numa * nid + numa_rxq_id; + add_netdev_irq_affinity_cpu(vecls_dev, rxq_id, cpu); + } + } + + if (!remain) + return; + + // config remain rxq to every numa + numa_rxq_id = 0; + for (nid = 0; nid < vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + if (numa_rxq_id >= remain) + break; + cpu = alloc_available_cpu(nid, numa_info); + if (cpu < 0) + break; + + rxq_id = rxq_per_numa * vecls_numa_num + numa_rxq_id; + numa_rxq_id++; + add_netdev_irq_affinity_cpu(vecls_dev, rxq_id, cpu); + } +} + +static void config_affinity_strategy_cluster(struct vecls_netdev_info *vecls_dev) +{ + int rxq_num = vecls_dev->rxq_num; + int rxq_per_numa = rxq_num / vecls_numa_num; + int remain = rxq_num - rxq_per_numa * vecls_numa_num; + int cpu_idx = vecls_cluster_cpu_num - 1; + int cluster, cpu, rxq_id = 0, round; + + round = rxq_per_numa < vecls_cluster_per_numa ? 
rxq_per_numa : vecls_cluster_per_numa; + if (remain > 0) + round++; + vecls_debug("round=%d\n", round); + + while (rxq_id < vecls_dev->rxq_num) { + for (cluster = 0; cluster < vecls_cluster_per_numa * vecls_numa_num; cluster++) { + if (cluster % vecls_cluster_per_numa >= round) + continue; + cpu = cluster * vecls_cluster_cpu_num + cpu_idx; + if (rxq_id >= vecls_dev->rxq_num) + break; + add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); + } + cpu_idx--; + if (--cpu_idx < 0) + cpu_idx = vecls_cluster_cpu_num - 1; + } +} + +static void config_affinity_strategy_numa(struct vecls_netdev_info *vecls_dev) +{ + int rxq_num = vecls_dev->rxq_num; + int rxq_per_numa = rxq_num / vecls_numa_num; + int cpu_per_numa = nr_cpu_ids / vecls_numa_num; + int remain = rxq_num - rxq_per_numa * vecls_numa_num; + struct vecls_numa_info *numa_info; + int numa_start_cpu, numa_cpu_id; + int rxq_id = 0, nid, cpu; + + for (nid = 0; nid < vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + numa_start_cpu = find_first_bit(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); + for (numa_cpu_id = 0; numa_cpu_id < rxq_per_numa; numa_cpu_id++) { + cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa); + if (rxq_id >= vecls_dev->rxq_num) + break; + add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); + } + if (remain-- > 0) { + cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa); + add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); + } + } +} + +static void config_affinity_strategy_custom(struct vecls_netdev_info *vecls_dev) +{ + vecls_debug("dev=%s\n", vecls_dev->dev_name); +} + +static void config_affinity_strategy(void) +{ + struct vecls_netdev_info *vecls_dev; + int devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + switch (strategy) { + case 1: + config_affinity_strategy_cluster(vecls_dev); + break; + case 2: + config_affinity_strategy_numa(vecls_dev); + break; + 
case 3: + config_affinity_strategy_custom(vecls_dev); + break; + case 0: + default: + config_affinity_strategy_default(vecls_dev); + break; + } + } +} + +static inline void irq_set_affinity_wrapper(int rxq, int irq, int cpu) +{ + int err = 0; + + err = irq_set_affinity(irq, get_cpu_mask(cpu)); + vecls_debug("rxq=%d, irq=%d, cpu=%d, err=%d\n", rxq, irq, cpu, err); +} + +static void enable_affinity_strategy(void) +{ + struct vecls_netdev_queue_info *rxq_info; + struct vecls_netdev_info *vecls_dev; + int rxq_id, devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { + rxq_info = &vecls_dev->rxq[rxq_id]; + irq_set_affinity_wrapper(rxq_id, rxq_info->irq, rxq_info->affinity_cpu); + } + } +} + +static inline void netif_set_xps_queue_wrapper(struct net_device *netdev, int rxq_id, + const struct cpumask *cpu_mask) +{ + int err = 0; + + err = netif_set_xps_queue(netdev, cpu_mask, rxq_id); + vecls_debug("name=%s, rxq_id=%d, mask=%*pbl, err=%d\n", netdev->name, rxq_id, + cpumask_pr_args(cpu_mask), err); +} + +static void set_netdev_xps_queue(bool enable) +{ + const struct cpumask clear_mask = { 0 }; + struct vecls_netdev_info *vecls_dev; + const struct cpumask *cpu_mask; + int rxq_id, devid, cpu, nid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { + cpu = vecls_dev->rxq[rxq_id].affinity_cpu; + nid = cpu_to_node(cpu); + if (enable) + cpu_mask = cpumask_of_node(nid); + else + cpu_mask = &clear_mask; + + netif_set_xps_queue_wrapper(vecls_dev->netdev, rxq_id, cpu_mask); + } + } +} + +static int __maybe_unused venetcls_status_seq_show(struct seq_file *seq, void *v) +{ + int err; + + if (mode == 0) + err = venetcls_ntuple_status(seq, v); + else + err = venetcls_flow_status(seq, v); + return err; +} + 
+/* + * vecls_init - module entry point. + * + * Validates the module parameters, builds the NUMA and netdev tables, + * applies the IRQ affinity strategy, sets up the per-NUMA rx-queue + * accounting, XPS, the ntuple (mode 0) or flow (mode 1) resources and + * finally the "venet_status" proc entry. Each failure unwinds what has + * been set up so far, mirroring vecls_exit(). + * + * Returns 0 on success or a negative errno. + */ +static __init int vecls_init(void) +{ + struct vecls_numa_info *numa_info; + int nid, err; + + if (!check_params()) + return -EINVAL; + + err = init_vecls_numa_info(); + if (err) + return err; + + err = init_vecls_netdev_info(ifname); + if (err) + goto clean_numa; + + // Set irq affinity + config_affinity_strategy(); + enable_affinity_strategy(); + + // Calculate rxq bounded to one numa + for (nid = 0; nid < vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + /* On failure init_numa_rxq_bitmap() has already called + * clean_vecls_rxq(); skip clean_rxq so nothing is freed twice. + */ + err = init_numa_rxq_bitmap(nid, numa_info); + if (err) + goto clean_numa; + } + +#ifdef CONFIG_XPS + set_netdev_xps_queue(true); +#endif + + if (mode == 0) + err = vecls_ntuple_res_init(); + else + err = vecls_flow_res_init(); + + if (err) + goto clean_rxq; + +#ifdef CONFIG_PROC_FS + if (!proc_create_net_single("venet_status", 0444, init_net.proc_net, + venetcls_status_seq_show, NULL)) { + err = -ENOMEM; + /* The resources initialised above are live; release them the + * same way vecls_exit() does before the load fails. + */ + if (mode == 0) + vecls_ntuple_res_clean(); + else + vecls_flow_res_clean(); + goto clean_rxq; + } +#endif + + return 0; + +clean_rxq: +#ifdef CONFIG_XPS + set_netdev_xps_queue(false); +#endif + clean_vecls_rxq(); +clean_numa: + clean_vecls_netdev_info(); + clean_vecls_numa_info(); + return err; +} + +static __exit void vecls_exit(void) +{ +#ifdef CONFIG_PROC_FS + remove_proc_entry("venet_status", init_net.proc_net); +#endif + if (mode == 0) + vecls_ntuple_res_clean(); + else + vecls_flow_res_clean(); + +#ifdef CONFIG_XPS + set_netdev_xps_queue(false); +#endif + + clean_vecls_rxq(); + clean_vecls_netdev_info(); + clean_vecls_numa_info(); +} + +module_init(vecls_init); +module_exit(vecls_exit); + +MODULE_DESCRIPTION("venetcls"); +MODULE_LICENSE("GPL"); diff --git a/net/venetcls/venetcls_ntuple.c b/net/venetcls/venetcls_ntuple.c new file mode 100644 index 000000000000..ad3c10f8ae5f --- /dev/null +++ b/net/venetcls/venetcls_ntuple.c @@ -0,0 +1,713 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/inetdevice.h> +#include <linux/ethtool.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/inet.h> +#include <linux/jhash.h> +#include 
<linux/venetcls.h> +#include <net/addrconf.h> +#include <net/sock.h> + +#include "venetcls.h" + +struct vecls_sk_rule_list vecls_sk_rules, vecls_sk_list; +static struct workqueue_struct *do_cfg_workqueue; +static atomic_t vecls_worker_count = ATOMIC_INIT(0); + +static void init_vecls_sk_rules(void) +{ + unsigned int i; + + for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) + INIT_HLIST_HEAD(vecls_sk_rules.hash + i); + mutex_init(&vecls_sk_rules.mutex); +} + +static inline u32 get_hash(struct cmd_context ctx) +{ + u32 hash; + + if (ctx.is_ipv6) + hash = jhash_2words(jhash(ctx.dip6, 16, 0), ctx.dport, 0); + else + hash = jhash_2words(ctx.dip4, ctx.dport, 0); + + return hash; +} + +static inline struct hlist_head *get_rule_hashlist(struct cmd_context ctx) +{ + u32 hash; + + hash = get_hash(ctx); + return vecls_sk_rules.hash + (hash & VECLS_SK_RULE_HASHMASK); +} + +/* + * get_sk_hashlist - bucket of vecls_sk_list for a socket. + * @sk: socket pointer used as the lookup key (entries compare entry->sk == sk) + * + * The key is the pointer value itself, so hash the bytes of the pointer + * (&sk), not the memory it points to: hashing the object would read the + * socket's contents and move the entry to another bucket when they change. + */ +static inline struct hlist_head *get_sk_hashlist(void *sk) +{ + return vecls_sk_list.hash + (jhash(&sk, sizeof(sk), 0) & VECLS_SK_RULE_HASHMASK); +} + +static void add_sk_rule(int devid, struct cmd_context ctx, void *sk, int nid) +{ + struct hlist_head *hlist = get_rule_hashlist(ctx); + struct hlist_head *sk_hlist = get_sk_hashlist(sk); + struct vecls_sk_rule *rule; + struct vecls_sk_entry *entry; + + rule = kzalloc(sizeof(*rule), GFP_ATOMIC); + if (!rule) { + vecls_error("alloc rule failed\n"); + return; + } + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) { + vecls_error("alloc entry failed\n"); + kfree(rule); + return; + } + + rule->sk = sk; + rule->is_ipv6 = ctx.is_ipv6; + rule->dip4 = ctx.dip4; + memcpy(rule->dip6, ctx.dip6, sizeof(rule->dip6)); + rule->dport = ctx.dport; + rule->devid = devid; + rule->action = ctx.action; + rule->ruleid = ctx.ret_loc; + rule->nid = nid; + hlist_add_head(&rule->node, hlist); + + entry->sk = sk; + entry->sk_rule_hash = get_hash(ctx); + hlist_add_head(&entry->node, sk_hlist); +} + +static struct vecls_sk_entry *get_sk_entry(void *sk) +{ + struct hlist_head *sk_hlist 
= get_sk_hashlist(sk); + struct vecls_sk_entry *entry = NULL; + + hlist_for_each_entry(entry, sk_hlist, node) { + if (entry->sk == sk) + break; + } + return entry; +} + +static void del_sk_rule(struct vecls_sk_rule *rule) +{ + struct vecls_sk_entry *entry; + + entry = get_sk_entry(rule->sk); + if (!entry) + return; + hlist_del_init(&entry->node); + kfree(entry); + + vecls_debug("del rule=%p\n", rule); + hlist_del_init(&rule->node); + kfree(rule); +} + +static struct vecls_sk_rule *get_sk_rule(int devid, struct cmd_context ctx) +{ + struct hlist_head *hlist = get_rule_hashlist(ctx); + struct vecls_sk_rule *rule = NULL; + + hlist_for_each_entry(rule, hlist, node) { + if (rule->devid != devid || rule->dport != ctx.dport) + continue; + if (!rule->is_ipv6 && rule->dip4 == ctx.dip4) + break; + if (rule->is_ipv6 && !memcmp(rule->dip6, ctx.dip6, sizeof(rule->dip6))) + break; + } + return rule; +} + +static struct vecls_sk_rule *get_rule_from_sk(int devid, void *sk) +{ + struct vecls_sk_rule *rule = NULL; + struct vecls_sk_entry *entry; + struct hlist_head *hlist; + + entry = get_sk_entry(sk); + if (!entry) + return NULL; + + hlist = vecls_sk_rules.hash + (entry->sk_rule_hash & VECLS_SK_RULE_HASHMASK); + hlist_for_each_entry(rule, hlist, node) { + if (rule->devid == devid && rule->sk == sk) + break; + } + return rule; +} + +static inline bool reuseport_check(int devid, struct cmd_context ctx) +{ + return !!get_sk_rule(devid, ctx); +} + +static u32 get_first_ip4_addr(struct net *net) +{ + struct in_device *in_dev; + struct net_device *dev; + struct in_ifaddr *ifa; + u32 dip4 = 0; + + rtnl_lock(); + rcu_read_lock(); + for_each_netdev(net, dev) { + if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP)) + continue; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (!strcmp(dev->name, ifa->ifa_label)) { + dip4 = ifa->ifa_local; + vecls_debug("dev:%s dip:%pI4\n", dev->name, &dip4); + goto out; + } + } + } +out: + 
rcu_read_unlock(); + rtnl_unlock(); + return dip4; +} + +static void get_first_ip6_addr(struct net *net, u32 *dip6) +{ + struct inet6_dev *idev; + struct net_device *dev; + struct inet6_ifaddr *ifp; + + rtnl_lock(); + rcu_read_lock(); + for_each_netdev(net, dev) { + if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP)) + continue; + idev = __in6_dev_get(dev); + if (!idev) + continue; + list_for_each_entry_rcu(ifp, &idev->addr_list, if_list) { + if (ifp->scope == RT_SCOPE_HOST) + continue; + if (ifp->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED)) + continue; + memcpy(dip6, &ifp->addr, sizeof(ifp->addr)); + vecls_debug("dev:%s dip:%pI6\n", dev->name, dip6); + goto out; + } + } +out: + rcu_read_unlock(); + rtnl_unlock(); +} + +static void get_sk_rule_addr(struct sock *sk, struct cfg_param *ctx_p) +{ + bool is_ipv6 = !!(sk->sk_family == AF_INET6); + u16 *dport = &ctx_p->ctx.dport; + u32 *dip4 = &ctx_p->ctx.dip4; + u32 *dip6 = &ctx_p->ctx.dip6[0]; + + *dport = htons(sk->sk_num); + ctx_p->ctx.is_ipv6 = is_ipv6; + + if (!match_ip_flag) { + *dip4 = 0; + memset(dip6, 0, sizeof(sk->sk_v6_rcv_saddr)); + return; + } + + if (is_ipv6) { + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + memcpy(dip6, &sk->sk_v6_rcv_saddr, sizeof(sk->sk_v6_rcv_saddr)); + else + get_first_ip6_addr(sock_net(sk), dip6); + + } else { + if (sk->sk_rcv_saddr) + *dip4 = sk->sk_rcv_saddr; + else + *dip4 = get_first_ip4_addr(sock_net(sk)); + } +} + +static int rxclass_rule_del(struct cmd_context *ctx, __u32 loc) +{ + struct ethtool_rxnfc nfccmd; + int err; + + nfccmd.cmd = ETHTOOL_SRXCLSRLDEL; + nfccmd.fs.location = loc; + err = send_ethtool_ioctl(ctx, &nfccmd); + if (err < 0) + vecls_debug("rmgr: Cannot delete RX class rule, loc:%u\n", loc); + return err; +} + +static int rmgr_ins(struct rmgr_ctrl *rmgr, __u32 loc) +{ + if (loc >= rmgr->size) { + vecls_error("rmgr: Location out of range\n"); + return -1; + } + + set_bit(loc, rmgr->slot); + return 0; +} + +static int rmgr_find_empty_slot(struct rmgr_ctrl 
*rmgr, struct ethtool_rx_flow_spec *fsp) +{ + __u32 loc, slot_num; + + if (rmgr->driver_select) + return 0; + + loc = rmgr->size - 1; + slot_num = loc / BITS_PER_LONG; + if (!~(rmgr->slot[slot_num] | (~1UL << rmgr->size % BITS_PER_LONG))) { + loc -= 1 + (loc % BITS_PER_LONG); + slot_num--; + } + + while (loc < rmgr->size && !~(rmgr->slot[slot_num])) { + loc -= BITS_PER_LONG; + slot_num--; + } + + while (loc < rmgr->size && test_bit(loc, rmgr->slot)) + loc--; + + if (loc < rmgr->size) { + fsp->location = loc; + return rmgr_ins(rmgr, loc); + } + + return -1; +} + +static int rxclass_get_dev_info(struct cmd_context *ctx, __u32 *count, int *driver_select) +{ + struct ethtool_rxnfc nfccmd; + int err; + + nfccmd.cmd = ETHTOOL_GRXCLSRLCNT; + nfccmd.data = 0; + err = send_ethtool_ioctl(ctx, &nfccmd); + *count = nfccmd.rule_cnt; + if (driver_select) + *driver_select = !!(nfccmd.data & RX_CLS_LOC_SPECIAL); + if (err < 0) + vecls_debug("rxclass: Cannot get RX class rule count\n"); + + return err; +} + +static int rmgr_init(struct cmd_context *ctx, struct rmgr_ctrl *rmgr) +{ + struct ethtool_rxnfc *nfccmd; + __u32 *rule_locs; + int i, err = 0; + + memset(rmgr, 0, sizeof(*rmgr)); + err = rxclass_get_dev_info(ctx, &rmgr->n_rules, &rmgr->driver_select); + if (err < 0) + return err; + + if (rmgr->driver_select) + return err; + + nfccmd = kzalloc(sizeof(*nfccmd) + (rmgr->n_rules * sizeof(__u32)), GFP_ATOMIC); + if (!nfccmd) { + vecls_error("rmgr: Cannot allocate memory for RX class rule locations\n"); + err = -ENOMEM; + goto out; + } + + nfccmd->cmd = ETHTOOL_GRXCLSRLALL; + nfccmd->rule_cnt = rmgr->n_rules; + err = send_ethtool_ioctl(ctx, nfccmd); + if (err < 0) { + vecls_debug("rmgr: Cannot get RX class rules\n"); + goto out; + } + + rmgr->size = nfccmd->data; + if (rmgr->size == 0 || rmgr->size < rmgr->n_rules) { + vecls_error("rmgr: Invalid RX class rules table size\n"); + err = -EINVAL; + goto out; + } + + rmgr->slot = kzalloc(BITS_TO_LONGS(rmgr->size) * sizeof(long), 
GFP_ATOMIC); + if (!rmgr->slot) { + vecls_error("rmgr: Cannot allocate memory for RX class rules\n"); + err = -ENOMEM; + goto out; + } + + rule_locs = nfccmd->rule_locs; + for (i = 0; i < rmgr->n_rules; i++) { + err = rmgr_ins(rmgr, rule_locs[i]); + if (err < 0) + break; + } + +out: + kfree(nfccmd); + return err; +} + +static void rmgr_cleanup(struct rmgr_ctrl *rmgr) +{ + kfree(rmgr->slot); + rmgr->slot = NULL; + rmgr->size = 0; +} + +static int rmgr_set_location(struct cmd_context *ctx, + struct ethtool_rx_flow_spec *fsp) +{ + struct rmgr_ctrl rmgr; + int ret; + + ret = rmgr_init(ctx, &rmgr); + if (ret < 0) + goto out; + + ret = rmgr_find_empty_slot(&rmgr, fsp); +out: + rmgr_cleanup(&rmgr); + return ret; +} + +static int rxclass_rule_ins(struct cmd_context *ctx, + struct ethtool_rx_flow_spec *fsp, u32 rss_context) +{ + struct ethtool_rxnfc nfccmd; + u32 loc = fsp->location; + int ret; + + if (loc & RX_CLS_LOC_SPECIAL) { + ret = rmgr_set_location(ctx, fsp); + if (ret < 0) + return ret; + } + + nfccmd.cmd = ETHTOOL_SRXCLSRLINS; + nfccmd.rss_context = rss_context; + nfccmd.fs = *fsp; + ret = send_ethtool_ioctl(ctx, &nfccmd); + if (ret < 0) { + vecls_debug("Can not insert the clasification rule\n"); + return ret; + } + + if (loc & RX_CLS_LOC_SPECIAL) + vecls_debug("Added rule with ID %d\n", nfccmd.fs.location); + + return 0; +} + +static int cfg_ethtool_rule(struct cmd_context *ctx, bool is_del) +{ + struct ethtool_rx_flow_spec *fsp, rx_rule_fs; + u32 rss_context = 0; + bool is_ipv6 = ctx->is_ipv6; + int ret, i; + + if (ctx->is_ipv6) + vecls_debug("del:%d dev:%s dip:%pI6 dport:%d action:%d ruleid:%u del_ruleid:%u\n", + is_del, ctx->netdev, &ctx->dip6, ntohs(ctx->dport), ctx->action, + ctx->ruleid, ctx->del_ruleid); + else + vecls_debug("del:%d dev:%s dip:%pI4 dport:%d action:%d ruleid:%u del_ruleid:%u\n", + is_del, ctx->netdev, &ctx->dip4, ntohs(ctx->dport), ctx->action, + ctx->ruleid, ctx->del_ruleid); + + if (is_del) + return rxclass_rule_del(ctx, ctx->del_ruleid); 
+ + ctx->ret_loc = -1; + + fsp = &rx_rule_fs; + memset(fsp, 0, sizeof(*fsp)); + if (is_ipv6) { + fsp->flow_type = TCP_V6_FLOW; + memcpy(fsp->h_u.tcp_ip6_spec.ip6dst, ctx->dip6, sizeof(ctx->dip6)); + fsp->h_u.tcp_ip6_spec.pdst = ctx->dport; + fsp->m_u.tcp_ip6_spec.pdst = (u16)~0ULL; + if (ctx->dip6[0] | ctx->dip6[1] | ctx->dip6[2] | ctx->dip6[3]) { + for (i = 0; i < 4; i++) + fsp->m_u.tcp_ip6_spec.ip6dst[i] = (u32)~0ULL; + } + } else { + fsp->flow_type = TCP_V4_FLOW; + fsp->h_u.tcp_ip4_spec.ip4dst = ctx->dip4; + fsp->h_u.tcp_ip4_spec.pdst = ctx->dport; + fsp->m_u.tcp_ip4_spec.pdst = (u16)~0ULL; + if (ctx->dip4) + fsp->m_u.tcp_ip4_spec.ip4dst = (u32)~0ULL; + } + fsp->location = RX_CLS_LOC_ANY; + if (ctx->ruleid) + fsp->location = ctx->ruleid; + fsp->ring_cookie = ctx->action; + + ret = rxclass_rule_ins(ctx, &rx_rule_fs, rss_context); + if (!ret) + ctx->ret_loc = rx_rule_fs.location; + return ret; +} + +static void cfg_work(struct work_struct *work) +{ + struct cfg_param *ctx_p = container_of(work, struct cfg_param, work); + struct vecls_netdev_info *vecls_dev; + struct vecls_sk_rule *rule; + int devid, rxq_id, err; + + mutex_lock(&vecls_sk_rules.mutex); + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + strscpy(ctx_p->ctx.netdev, vecls_dev->dev_name, IFNAMSIZ); + if (!(ctx_p->is_del)) { + if (reuseport_check(devid, ctx_p->ctx)) { + if (ctx_p->ctx.is_ipv6) + vecls_debug("dip:%pI6, dport:%d reuse!\n", + &ctx_p->ctx.dip6, ntohs(ctx_p->ctx.dport)); + else + vecls_debug("dip:%pI4, dport:%d reuse!\n", + &ctx_p->ctx.dip4, ntohs(ctx_p->ctx.dport)); + continue; + } + + // Calculate the bound queue + rxq_id = alloc_rxq_id(ctx_p->nid, ctx_p->cpu, devid); + if (rxq_id < 0) + continue; + + // Config Ntuple rule to dev + ctx_p->ctx.action = (u16)rxq_id; + err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); + if (err) { + vecls_debug("Add sk:%p, dev_id:%d, rxq:%d, err:%d\n", + ctx_p->sk, devid, rxq_id, 
err); + free_rxq_id(ctx_p->nid, devid, rxq_id); + continue; + } + add_sk_rule(devid, ctx_p->ctx, ctx_p->sk, ctx_p->nid); + } else { + rule = get_rule_from_sk(devid, ctx_p->sk); + if (!rule) { + vecls_debug("rule not found! sk:%p, devid:%d, dip4:%pI4, dport:%d\n", + ctx_p->sk, devid, &ctx_p->ctx.dip4, + ntohs(ctx_p->ctx.dport)); + continue; + } + + // Config Ntuple rule to dev + ctx_p->ctx.del_ruleid = rule->ruleid; + err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); + // Free the bound queue + free_rxq_id(rule->nid, devid, rule->action); + // Delete sk rule + del_sk_rule(rule); + } + } + mutex_unlock(&vecls_sk_rules.mutex); + kfree(ctx_p); + atomic_dec(&vecls_worker_count); +} + +static bool has_sock_rule(struct sock *sk) +{ + struct vecls_netdev_info *vecls_dev; + struct vecls_sk_rule *rule; + int devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + rule = get_rule_from_sk(devid, sk); + if (rule) + return true; + } + return false; +} + +static void del_ntuple_rule(struct sock *sk) +{ + struct cfg_param *ctx_p; + + if (!has_sock_rule(sk)) + return; + + ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); + if (!ctx_p) + return; + get_sk_rule_addr(sk, ctx_p); + + ctx_p->is_del = true; + ctx_p->sk = sk; + INIT_WORK(&ctx_p->work, cfg_work); + queue_work(do_cfg_workqueue, &ctx_p->work); + atomic_inc(&vecls_worker_count); +} + +static void add_ntuple_rule(struct sock *sk) +{ + struct cfg_param *ctx_p; + int cpu = raw_smp_processor_id(); + int nid = cpu_to_node(cpu); + + if (check_appname(current->comm)) + return; + + ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); + if (!ctx_p) + return; + get_sk_rule_addr(sk, ctx_p); + + ctx_p->is_del = false; + ctx_p->sk = sk; + ctx_p->nid = nid; + ctx_p->cpu = cpu; + INIT_WORK(&ctx_p->work, cfg_work); + queue_work(do_cfg_workqueue, &ctx_p->work); + atomic_inc(&vecls_worker_count); +} + +static void ethtool_cfg_rxcls(struct sock *sk, int is_del) +{ + bool 
is_ipv6; + + if (sk->sk_state != TCP_LISTEN) + return; + + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return; + + is_ipv6 = !!(sk->sk_family == AF_INET6); + if (is_ipv6) + vecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, IPv6:%pI6, port:%d\n", + raw_smp_processor_id(), current->comm, sk, is_del, + &sk->sk_v6_rcv_saddr, (u16)sk->sk_num); + else + vecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, IPv4:%pI4, port:%d\n", + raw_smp_processor_id(), current->comm, sk, is_del, + &sk->sk_rcv_saddr, (u16)sk->sk_num); + + if (is_del) + del_ntuple_rule(sk); + else + add_ntuple_rule(sk); +} + +static void clean_vecls_sk_rules(void) +{ + struct vecls_netdev_info *vecls_dev; + struct cmd_context ctx = { 0 }; + struct vecls_sk_rule *rule; + struct hlist_head *hlist; + struct hlist_node *n; + unsigned int i; + int err; + + mutex_lock(&vecls_sk_rules.mutex); + for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) { + hlist = &vecls_sk_rules.hash[i]; + + hlist_for_each_entry_safe(rule, n, hlist, node) { + vecls_dev = get_vecls_netdev_info(rule->devid); + if (!vecls_dev) + continue; + strscpy(ctx.netdev, vecls_dev->dev_name, IFNAMSIZ); + ctx.del_ruleid = rule->ruleid; + err = cfg_ethtool_rule(&ctx, true); + vecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, err:%d\n", rule->sk, + rule->devid, rule->action, rule->ruleid, err); + + hlist_del(&rule->node); + vecls_debug("clean rule=%p\n", rule); + kfree(rule); + } + } + mutex_unlock(&vecls_sk_rules.mutex); +} + +int venetcls_ntuple_status(struct seq_file *seq, void *v) +{ + struct vecls_netdev_info *vecls_dev; + struct vecls_sk_rule *rule; + struct hlist_head *hlist; + struct hlist_node *n; + unsigned int i; + + seq_printf(seq, "%-16s %-42s %-8s %-6s %-6s %-6s\n", + "Interface", "dstIP", "dstPort", "rxq", "ruleId", "NumaID"); + mutex_lock(&vecls_sk_rules.mutex); + for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) { + hlist = &vecls_sk_rules.hash[i]; + hlist_for_each_entry_safe(rule, n, hlist, node) { + vecls_dev = 
get_vecls_netdev_info(rule->devid); + if (!vecls_dev) + continue; + if (rule->is_ipv6) + seq_printf(seq, "%-16s %-42pI6 %-8d %-6d %-6d %-6d\n", + vecls_dev->dev_name, &rule->dip6, ntohs(rule->dport), + rule->action, rule->ruleid, rule->nid); + else + seq_printf(seq, "%-16s %-42pI4 %-8d %-6d %-6d %-6d\n", + vecls_dev->dev_name, &rule->dip4, ntohs(rule->dport), + rule->action, rule->ruleid, rule->nid); + } + } + mutex_unlock(&vecls_sk_rules.mutex); + + return 0; +} + +static const struct vecls_hook_ops vecls_ntuple_ops = { + .vecls_flow_update = NULL, + .vecls_set_cpu = NULL, + .vecls_timeout = NULL, + .vecls_cfg_rxcls = ethtool_cfg_rxcls, +}; + +int vecls_ntuple_res_init(void) +{ + do_cfg_workqueue = alloc_ordered_workqueue("vecls_cfg", 0); + if (!do_cfg_workqueue) { + vecls_debug("alloc_ordered_workqueue fails\n"); + return -ENOMEM; + } + + init_vecls_sk_rules(); + RCU_INIT_POINTER(vecls_ops, &vecls_ntuple_ops); + synchronize_rcu(); + return 0; +} + +void vecls_ntuple_res_clean(void) +{ + RCU_INIT_POINTER(vecls_ops, NULL); + synchronize_rcu(); + + while (atomic_read(&vecls_worker_count) != 0) + mdelay(1); + destroy_workqueue(do_cfg_workqueue); + clean_vecls_sk_rules(); +} -- 2.34.1