[PATCH OLK-6.6 00/15] Backport venetcls for velinux
Li Xiasong (7): net/venetcls: free sk entry hash on ntuple cleanup net/venetcls: fix RPS CPU index scaling bias net/venetcls: fix mode-0 init rollback for ntuple resources net/venetcls: validate module params and reject zero table sizes net/venetcls: move listen-time tuple resolution to worker context net/venetcls: fix async delete race with pending rule add net/venetcls: make rps policy module params read-only Yue Haibing (8): net/venetcls: introduce venetcls for network optimization net/venetcls: Fix possible hash collision issue net/venetcls: Make VENETCLS default as module net/venetcls: Add local flow NUMA-aware rps net/venetcls: Add rps policy switch for phy NIC net/venetcls: Fix flow table init for rps_policy net/venetcls: Limit rxqs against combined_channels net/venetcls: Make VENETCLS default as module MAINTAINERS | 5 + include/linux/netdevice.h | 3 + include/linux/skbuff.h | 3 + include/linux/venetcls.h | 130 ++++ include/net/sock.h | 3 + kernel/irq/irqdesc.c | 2 +- net/Kconfig | 1 + net/Makefile | 1 + net/core/dev.c | 37 + net/core/sock.c | 3 + net/ipv4/af_inet.c | 6 + net/ipv4/tcp.c | 12 + net/venetcls/Kconfig | 11 + net/venetcls/Makefile | 7 + net/venetcls/asmdefs.S | 61 ++ net/venetcls/memcpy-sve.S | 157 ++++ net/venetcls/venetcls.h | 199 ++++++ net/venetcls/venetcls_flow.c | 638 +++++++++++++++++ net/venetcls/venetcls_main.c | 1220 ++++++++++++++++++++++++++++++++ net/venetcls/venetcls_ntuple.c | 744 +++++++++++++++++++ 20 files changed, 3242 insertions(+), 1 deletion(-) create mode 100644 include/linux/venetcls.h create mode 100644 net/venetcls/Kconfig create mode 100644 net/venetcls/Makefile create mode 100644 net/venetcls/asmdefs.S create mode 100644 net/venetcls/memcpy-sve.S create mode 100644 net/venetcls/venetcls.h create mode 100644 net/venetcls/venetcls_flow.c create mode 100644 net/venetcls/venetcls_main.c create mode 100644 net/venetcls/venetcls_ntuple.c -- 2.34.1
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,转换为PR失败! 邮件列表地址:https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/2XY... 失败原因:应用补丁/补丁集失败,Patch failed at 0001 net/venetcls: introduce venetcls for network optimization 建议解决方法:请查看失败原因, 确认补丁是否可以应用在当前期望分支的最新代码上 Feedback: The patch(es) you sent to kernel@openeuler.org could not be converted to a PR! Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/2XY... Failure reason: applying the patch(es) failed, Patch failed at 0001 net/venetcls: introduce venetcls for network optimization Suggested solution: please review the failure reason and check whether the patch(es) can be applied to the latest code on the expected branch
hulk inclusion category: feature Link: https://gitee.com/openeuler/kernel/issues/ICBFCS CVE: NA -------------------------------- This introduces a kind of network optimization method named venetcls. It can configure the ntuple rule, and bind interrupt to the netdev queue automatically. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> Signed-off-by: Wang Liang <wangliang74@huawei.com> Signed-off-by: Liu Jian <liujian56@huawei.com> Signed-off-by: yuelg <yuelg@chinaunicom.cn> --- MAINTAINERS | 5 + include/linux/netdevice.h | 3 + include/linux/venetcls.h | 101 +++ kernel/irq/irqdesc.c | 2 +- net/Kconfig | 1 + net/Makefile | 1 + net/core/dev.c | 23 + net/ipv4/af_inet.c | 6 + net/ipv4/tcp.c | 9 + net/venetcls/Kconfig | 11 + net/venetcls/Makefile | 7 + net/venetcls/asmdefs.S | 61 ++ net/venetcls/memcpy-sve.S | 157 +++++ net/venetcls/venetcls.h | 183 +++++ net/venetcls/venetcls_flow.c | 514 ++++++++++++++ net/venetcls/venetcls_main.c | 1154 ++++++++++++++++++++++++++++++++ net/venetcls/venetcls_ntuple.c | 713 ++++++++++++++++++++ 17 files changed, 2950 insertions(+), 1 deletion(-) create mode 100644 include/linux/venetcls.h create mode 100644 net/venetcls/Kconfig create mode 100644 net/venetcls/Makefile create mode 100644 net/venetcls/asmdefs.S create mode 100644 net/venetcls/memcpy-sve.S create mode 100644 net/venetcls/venetcls.h create mode 100644 net/venetcls/venetcls_flow.c create mode 100644 net/venetcls/venetcls_main.c create mode 100644 net/venetcls/venetcls_ntuple.c diff --git a/MAINTAINERS b/MAINTAINERS index ab1ff9b4195e..861b3418b947 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20568,6 +20568,11 @@ F: net/xdp/ F: samples/bpf/xdpsock* F: tools/lib/bpf/xsk* +VENETCLS +M: Yue Haibing <yuehaibing@huawei.com> +F: include/linux/venetcls.h +F: net/venetcls/ + XEN BLOCK SUBSYSTEM M: Roger Pau Monné <roger.pau@citrix.com> L: xen-devel@lists.xenproject.org (moderated for non-subscribers) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 
cc1f14f3c236..e5f876cecf15 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -766,6 +766,9 @@ struct netdev_rx_queue { struct xsk_buff_pool *pool; #endif struct file __rcu *dmabuf_pages; +#if IS_ENABLED(CONFIG_VENETCLS) + void __rcu *vecls_ftb; +#endif } ____cacheline_aligned_in_smp; struct page * diff --git a/include/linux/venetcls.h b/include/linux/venetcls.h new file mode 100644 index 000000000000..9cfcdd4e5766 --- /dev/null +++ b/include/linux/venetcls.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_VENETCLS_H +#define _LINUX_VENETCLS_H + +struct vecls_hook_ops { + void (*vecls_cfg_rxcls)(struct sock *sk, int is_del); + void (*vecls_flow_update)(struct sock *sk); + void (*vecls_set_cpu)(struct sk_buff *skb, int *cpu, int *last_qtail); + bool (*vecls_timeout)(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id); +}; + +typedef int (*enqueue_f)(struct sk_buff *skb, int cpu, unsigned int *qtail); +extern const struct vecls_hook_ops __rcu *vecls_ops; + +static inline void venetcls_cfg_rxcls(struct sock *sk, int is_del) +{ + const struct vecls_hook_ops *ops; + + rcu_read_lock(); + ops = rcu_dereference(vecls_ops); + if (ops && ops->vecls_cfg_rxcls) + ops->vecls_cfg_rxcls(sk, is_del); + rcu_read_unlock(); +} + +static inline void venetcls_flow_update(struct sock *sk) +{ + const struct vecls_hook_ops *ops; + + rcu_read_lock(); + ops = rcu_dereference(vecls_ops); + if (ops && ops->vecls_flow_update) + ops->vecls_flow_update(sk); + rcu_read_unlock(); +} + +static inline bool +venetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) +{ + const struct vecls_hook_ops *ops; + int cpu, last_qtail; + bool result = false; + + rcu_read_lock(); + ops = rcu_dereference(vecls_ops); + if (ops && ops->vecls_set_cpu) { + cpu = -1; + last_qtail = 0; + ops->vecls_set_cpu(skb, &cpu, &last_qtail); + if (cpu >= 0) { + *ret = enq_func(skb, cpu, &last_qtail); + result = true; + } + } + rcu_read_unlock(); 
+ return result; +} + +static inline void +venetcls_skblist_set_cpu(struct list_head *head, enqueue_f enq_func) +{ + const struct vecls_hook_ops *ops; + struct sk_buff *skb, *next; + int cpu, last_qtail; + + rcu_read_lock(); + ops = rcu_dereference(vecls_ops); + if (ops && ops->vecls_set_cpu) { + list_for_each_entry_safe(skb, next, head, list) { + cpu = -1; + last_qtail = 0; + ops->vecls_set_cpu(skb, &cpu, &last_qtail); + if (cpu >= 0) { + skb_list_del_init(skb); + enq_func(skb, cpu, &last_qtail); + } + } + } + rcu_read_unlock(); +} + +static inline bool venetcls_may_expire_flow(struct net_device *dev, + u16 rxq_index, u32 flow_id, + u16 filter_id, bool *expire) +{ + const struct vecls_hook_ops *ops; + bool ret = false; + + *expire = true; + rcu_read_lock(); + ops = rcu_dereference(vecls_ops); + if (ops && ops->vecls_timeout) { + *expire = ops->vecls_timeout(dev, rxq_index, flow_id, filter_id); + ret = true; + } + rcu_read_unlock(); + return ret; +} + +#endif /* _LINUX_VENETCLS_H */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8202d4a996a5..eb8641e22575 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -366,7 +366,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return radix_tree_lookup(&irq_desc_tree, irq); } -#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE +#if defined(CONFIG_KVM_BOOK3S_64_HV_MODULE) || IS_ENABLED(CONFIG_VENETCLS) EXPORT_SYMBOL_GPL(irq_to_desc); #endif diff --git a/net/Kconfig b/net/Kconfig index dc8451e75e4c..2b68c0f8625e 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -72,6 +72,7 @@ source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" source "net/xdp/Kconfig" +source "net/venetcls/Kconfig" config INET bool "TCP/IP networking" diff --git a/net/Makefile b/net/Makefile index 6a62e5b27378..a2cb1281e2a9 100644 --- a/net/Makefile +++ b/net/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ obj-$(CONFIG_MCTP) += mctp/ 
+obj-$(CONFIG_VENETCLS) += venetcls/ diff --git a/net/core/dev.c b/net/core/dev.c index f628494a1c0f..47b916ca8d46 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -160,6 +160,12 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) +#if IS_ENABLED(CONFIG_VENETCLS) +#include <linux/venetcls.h> +const struct vecls_hook_ops __rcu *vecls_ops __read_mostly; +EXPORT_SYMBOL_GPL(vecls_ops); +#endif + static DEFINE_SPINLOCK(ptype_lock); static DEFINE_SPINLOCK(offload_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; @@ -4770,6 +4776,10 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, bool expire = true; unsigned int cpu; +#if IS_ENABLED(CONFIG_VENETCLS) + if (venetcls_may_expire_flow(dev, rxq_index, flow_id, filter_id, &expire)) + return expire; +#endif rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { @@ -5881,6 +5891,12 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } } +#endif +#if IS_ENABLED(CONFIG_VENETCLS) + if (venetcls_skb_set_cpu(skb, enqueue_to_backlog, &ret)) { + rcu_read_unlock(); + return ret; + } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); @@ -5915,6 +5931,9 @@ static void netif_receive_skb_list_internal(struct list_head *head) } } } +#endif +#if IS_ENABLED(CONFIG_VENETCLS) + venetcls_skblist_set_cpu(head, enqueue_to_backlog); #endif __netif_receive_skb_list(head); rcu_read_unlock(); @@ -10272,6 +10291,10 @@ int __netdev_update_features(struct net_device *dev) return err < 0 ? 
0 : 1; } +#if IS_ENABLED(CONFIG_VENETCLS) +EXPORT_SYMBOL(__netdev_update_features); +#endif + static int netdev_do_alloc_pcpu_stats(struct net_device *dev) { void __percpu *v; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5dc1955e38c4..06b917182a5a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -120,6 +120,9 @@ #include <net/compat.h> #include <trace/events/sock.h> +#if IS_ENABLED(CONFIG_VENETCLS) +#include <linux/venetcls.h> +#endif /* The inetsw table contains everything that inet_create needs to * build a new socket. @@ -229,6 +232,9 @@ int inet_listen(struct socket *sock, int backlog) if (err) goto out; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); +#if IS_ENABLED(CONFIG_VENETCLS) + venetcls_cfg_rxcls(sk, 0); +#endif } err = 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e8b7f0c5dded..cc84873cee0d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -281,6 +281,9 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> #include <linux/dma-buf.h> +#if IS_ENABLED(CONFIG_VENETCLS) +#include <linux/venetcls.h> +#endif /* Track pending CMSGs. 
*/ enum { @@ -2940,6 +2943,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); +#if IS_ENABLED(CONFIG_VENETCLS) + venetcls_flow_update(sk); +#endif if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && sk->sk_state == TCP_ESTABLISHED) @@ -3300,6 +3306,9 @@ void __tcp_close(struct sock *sk, long timeout) void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); +#if IS_ENABLED(CONFIG_VENETCLS) + venetcls_cfg_rxcls(sk, 1); +#endif __tcp_close(sk, timeout); release_sock(sk); sock_put(sk); diff --git a/net/venetcls/Kconfig b/net/venetcls/Kconfig new file mode 100644 index 000000000000..7f2ea5c4a6b0 --- /dev/null +++ b/net/venetcls/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +config VENETCLS + tristate "Network classification" + depends on MODULES + default n + help + This introduces a kind of network optimization method, which can + configure the flow steer rules, and bind interrupt to the netdev + queue automatically. + + This module can only be built as a loadable module. diff --git a/net/venetcls/Makefile b/net/venetcls/Makefile new file mode 100644 index 000000000000..639a81d7d6b2 --- /dev/null +++ b/net/venetcls/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_VENETCLS) = venetcls.o +venetcls-y := venetcls_main.o venetcls_ntuple.o venetcls_flow.o +ifeq ($(CONFIG_ARM64_SVE),y) +venetcls-y += memcpy-sve.o +endif diff --git a/net/venetcls/asmdefs.S b/net/venetcls/asmdefs.S new file mode 100644 index 000000000000..8138a94c18af --- /dev/null +++ b/net/venetcls/asmdefs.S @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). 
*/ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text + +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. */ +GNU_PROPERTY(FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name, %function; \ + .align alignment; \ +name: \ + .cfi_startproc; \ + BTI_C; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name, %function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#endif diff --git a/net/venetcls/memcpy-sve.S b/net/venetcls/memcpy-sve.S new file mode 100644 index 000000000000..0452ff8b3afb --- /dev/null +++ b/net/venetcls/memcpy-sve.S @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#include "asmdefs.S" + +.arch armv8-a+sve + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x6 +#define vlen x6 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + SVE vectors are used to speedup small copies. 
+ + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_sve) +ENTRY (__memcpy_aarch64_sve) + cmp count, 128 + b.hi L(copy_long) + cntb vlen + cmp count, vlen, lsl 1 + b.hi L(copy32_128) + + whilelo p0.b, xzr, count + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] + ret + + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + add srcend, src, count + add dstend, dstin, count + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + add srcend, src, count + add dstend, dstin, count + + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. 
*/ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(return) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. 
*/ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(return): + ret + +END (__memcpy_aarch64_sve) diff --git a/net/venetcls/venetcls.h b/net/venetcls/venetcls.h new file mode 100644 index 000000000000..14f02cd962c3 --- /dev/null +++ b/net/venetcls/venetcls.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _NET_VENETCLS_H +#define _NET_VENETCLS_H +#include <linux/if.h> +#include <linux/mutex.h> +#include <linux/cpufeature.h> + +#define VECLS_MAX_NETDEV_NUM 8 +#define VECLS_MAX_RXQ_NUM_PER_DEV 256 +#define VECLS_MAX_CPU_NUM 1024 + +#define VECLS_TIMEOUT (5 * HZ) +#define VECLS_NO_FILTER 0xffff +#define VECLS_NO_CPU 0xffff + +#define RXQ_MAX_USECNT 0xFF + +struct vecls_netdev_queue_info { + int irq; + int affinity_cpu; +}; + +struct vecls_netdev_info { + char dev_name[IFNAMSIZ]; + struct net_device *netdev; + int rxq_num; + struct vecls_netdev_queue_info rxq[VECLS_MAX_RXQ_NUM_PER_DEV]; + int old_filter_state; +}; + +struct vecls_rxq { + int rxq_id; + int status; +}; + +struct vecls_numa_clusterinfo { + int cluster_id; + int cur_freeidx; + struct vecls_rxq rxqs[VECLS_MAX_RXQ_NUM_PER_DEV]; +}; + +struct vecls_numa_bound_dev_info { + unsigned char bitmap_rxq[VECLS_MAX_RXQ_NUM_PER_DEV]; + struct vecls_numa_clusterinfo *cluster_info; +}; + +struct vecls_numa_info { + DECLARE_BITMAP(avail_cpus, VECLS_MAX_CPU_NUM); + struct vecls_numa_bound_dev_info bound_dev[VECLS_MAX_NETDEV_NUM]; +}; + +struct cmd_context { + char netdev[IFNAMSIZ]; + bool is_ipv6; + u32 dip4; + u32 dip6[4]; + u16 dport; + u16 action; + u32 ruleid; + u32 del_ruleid; + int ret_loc; +}; + +#define VECLS_SK_RULE_HASHSIZE 256 +#define VECLS_SK_RULE_HASHMASK (VECLS_SK_RULE_HASHSIZE - 1) + +struct vecls_sk_rule_list { + struct hlist_head hash[VECLS_SK_RULE_HASHSIZE]; + /* Mutex to synchronize access to ntuple rule locking */ + struct mutex mutex; +}; + 
+struct vecls_sk_rule { + struct hlist_node node; + int devid; + void *sk; + bool is_ipv6; + u32 dip4; + u32 dip6[4]; + u16 dport; + int action; + int ruleid; + int nid; +}; + +struct vecls_sk_entry { + struct hlist_node node; + void *sk; + u32 sk_rule_hash; +}; + +struct vecls_dev_flow { + unsigned short cpu; + unsigned short filter; + unsigned long timeout; + int isvalid; +}; + +struct vecls_dev_flow_table { + unsigned int mask; + struct rcu_head rcu; + struct vecls_dev_flow flows[]; +}; + +struct vecls_sock_flow_table { + u32 mask; + u32 ents[] ____cacheline_aligned_in_smp; +}; + +#define VECLS_DEV_FLOW_TABLE_NUM 0x1000 +#define VECLS_SOCK_FLOW_TABLE_NUM 0x100000 +#define VECLS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct vecls_dev_flow_table) + \ + ((_num) * sizeof(struct vecls_dev_flow))) +#define VECLS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct vecls_sock_flow_table, ents[_num])) + +#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ + ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ + NETIF_F_RXHASH) + +struct rmgr_ctrl { + int driver_select; + unsigned long *slot; + __u32 n_rules; + __u32 size; +}; + +struct cfg_param { + struct work_struct work; + struct cmd_context ctx; + struct sock *sk; + bool is_del; + int nid; + int cpu; +}; + +extern int match_ip_flag; +extern int debug; +extern int vecls_netdev_num; +extern int vecls_numa_num; + +#define vecls_debug(fmt, ...) \ + do { \ + if (debug) \ + pr_info_ratelimited("venetcls [%s:%d]: " fmt,\ + __FILE__, __LINE__, ## __VA_ARGS__); \ + } while (0) + +#define vecls_error(fmt, ...) 
\ + pr_err_ratelimited("venetcls [%s:%d]: " fmt, __FILE__, __LINE__, ## __VA_ARGS__) + +struct vecls_netdev_info *get_vecls_netdev_info(unsigned int index); +struct vecls_numa_info *get_vecls_numa_info(unsigned int nid); + +#ifdef CONFIG_ARM64_SVE +void *__memcpy_aarch64_sve(void *, const void *, size_t); +#define memcpy_r(dst, src, len) \ + do { \ + void *_dst = dst; \ + const void *_src = src; \ + size_t _len = len; \ + if (system_supports_sve()) \ + __memcpy_aarch64_sve(_dst, _src, _len); \ + else \ + memcpy(_dst, _src, _len); \ + } while (0) +#else +#define memcpy_r(dst, src, len) memcpy(dst, src, len) +#endif + +int check_appname(char *task_name); +int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd); +int alloc_rxq_id(int nid, int cpu, int devid); +void free_rxq_id(int nid, int devid, int rxq_id); +int vecls_ntuple_res_init(void); +void vecls_ntuple_res_clean(void); +int venetcls_ntuple_status(struct seq_file *seq, void *v); +int vecls_flow_res_init(void); +void vecls_flow_res_clean(void); +int venetcls_flow_status(struct seq_file *seq, void *v); + +#endif /* _NET_VENETCLS_H */ diff --git a/net/venetcls/venetcls_flow.c b/net/venetcls/venetcls_flow.c new file mode 100644 index 000000000000..242254b9bfe0 --- /dev/null +++ b/net/venetcls/venetcls_flow.c @@ -0,0 +1,514 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/inetdevice.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/inet.h> +#include <linux/venetcls.h> +#include <net/sock.h> + +#include "venetcls.h" + +static u32 vecls_cpu_mask; +static struct vecls_sock_flow_table __rcu *vecls_sock_flow_table; +static DEFINE_MUTEX(vecls_sock_flow_mutex); +static DEFINE_SPINLOCK(vecls_dev_flow_lock); + +bool is_vecls_config_netdev(const char *name) +{ + struct vecls_netdev_info *vecls_dev; + int devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) 
+ continue; + if (strcmp(vecls_dev->dev_name, name) == 0) + return true; + } + + return false; +} + +static bool _vecls_timeout(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct vecls_dev_flow_table *flow_table; + struct vecls_dev_flow *rflow; + bool expire = true; + unsigned int cpu; + + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->vecls_ftb); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = READ_ONCE(rflow->cpu); + if (rflow->filter == filter_id && cpu < nr_cpu_ids) { + if (time_before(jiffies, rflow->timeout + VECLS_TIMEOUT)) { + expire = false; + } else { + rflow->isvalid = 0; + WRITE_ONCE(rflow->cpu, VECLS_NO_CPU); + } + } + } + rcu_read_unlock(); + if (expire) + vecls_debug("%s, dev:%s, rxq:%d, flow_id:%u, filter_id:%d, expire:%d\n", __func__, + dev->name, rxq_index, flow_id, filter_id, expire); + return expire; +} + +static void _vecls_flow_update(struct sock *sk) +{ + struct vecls_sock_flow_table *tb; + unsigned int hash, index; + u32 val; + u32 cpu = raw_smp_processor_id(); + + if (sk->sk_state != TCP_ESTABLISHED) + return; + + if (check_appname(current->comm)) + return; + + rcu_read_lock(); + tb = rcu_dereference(vecls_sock_flow_table); + hash = READ_ONCE(sk->sk_rxhash); + if (tb && hash) { + index = hash & tb->mask; + val = hash & ~vecls_cpu_mask; + val |= cpu; + + if (READ_ONCE(tb->ents[index]) != val) + WRITE_ONCE(tb->ents[index], val); + } + rcu_read_unlock(); +} + +static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb) +{ + struct vecls_numa_bound_dev_info *bound_dev = NULL; + struct vecls_netdev_info *vecls_dev; + struct vecls_numa_info *numa_info; + int i, devid, rxq_num, rxq_id; + u32 hash, index; + + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + return -1; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + 
if (!vecls_dev) + continue; + if (strcmp(vecls_dev->dev_name, dev->name) == 0) { + bound_dev = &numa_info->bound_dev[devid]; + break; + } + } + if (!bound_dev) + return -1; + + rxq_num = 0; + for (i = 0; i < VECLS_MAX_RXQ_NUM_PER_DEV; i++) { + if (bound_dev->bitmap_rxq[i] == RXQ_MAX_USECNT) + continue; + rxq_num++; + } + if (rxq_num == 0) + return -1; + hash = skb_get_hash(skb); + index = hash % rxq_num; + + i = 0; + for (rxq_id = 0; rxq_id < VECLS_MAX_RXQ_NUM_PER_DEV; rxq_id++) { + if (bound_dev->bitmap_rxq[rxq_id] == RXQ_MAX_USECNT) + continue; + if (i++ == index) + return rxq_id; + } + + vecls_debug("%s skb:%p, no found rxq\n", __func__, skb); + return -1; +} + +static void set_vecls_cpu(struct net_device *dev, struct sk_buff *skb, + struct vecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu) +{ + struct netdev_rx_queue *rxqueue; + struct vecls_dev_flow_table *dtb; + struct vecls_dev_flow *rflow; + u32 flow_id, hash; + int rxq_index, rc; + + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || + !(dev->features & NETIF_F_NTUPLE)) + return; + + rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb); + if (rxq_index == skb_get_rx_queue(skb) || rxq_index < 0) { + vecls_debug("%s skb:%p, old_rxq:%d, next_cpu:%d new_rxq:%d\n", + __func__, skb, old_rxq_id, next_cpu, rxq_index); + return; + } + + rxqueue = dev->_rx + rxq_index; + dtb = rcu_dereference(rxqueue->vecls_ftb); + if (!dtb) + return; + + hash = skb_get_hash(skb); + flow_id = hash & dtb->mask; + rflow = &dtb->flows[flow_id]; + + if (rflow->isvalid && cpu_to_node(rflow->cpu) == cpu_to_node(next_cpu)) { + rflow->timeout = jiffies; + return; + } + + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); + if (rc < 0) { + vecls_debug("skb:%p rxq:%d hash:0x%x flow_id:%u old_rxq:%d rflow->cpu:%d rflow->isvalid:%d next_cpu:%d rc:%d\n", + skb, rxq_index, hash, flow_id, old_rxq_id, rflow->cpu, + rflow->isvalid, next_cpu, rc); + return; + } + + rflow->filter = rc; + rflow->isvalid = 1; + 
rflow->timeout = jiffies; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = VECLS_NO_FILTER; + rflow->cpu = next_cpu; +} + +static int get_cpu_in_numa(int tcpu, u32 hash) +{ + const struct cpumask *mask; + int nr_cpus, cpu, index; + + mask = cpumask_of_node(cpu_to_node(tcpu)); + nr_cpus = cpumask_weight(mask); + if (nr_cpus == 0) + return -1; + + index = reciprocal_scale(hash, nr_cpus); + if (index < 0) + return -1; + + cpu = cpumask_first(mask); + while (--nr_cpus > 0) { + if (index == 0) + break; + cpu = cpumask_next(cpu, mask); + index--; + } + return cpu; +} + +static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, + struct vecls_sock_flow_table *tb, struct vecls_dev_flow_table *dtb, + int old_rxq_id, int *rcpu, int *last_qtail) +{ + u32 last_recv_cpu, hash, val, cpu, tcpu; + struct vecls_dev_flow *rflow; + int newcpu; + + cpu = raw_smp_processor_id(); + skb_reset_network_header(skb); + hash = skb_get_hash(skb); + if (!hash) + return; + + val = READ_ONCE(tb->ents[hash & tb->mask]); + last_recv_cpu = val & vecls_cpu_mask; + rflow = &dtb->flows[hash & dtb->mask]; + tcpu = rflow->cpu; + + if ((val ^ hash) & ~vecls_cpu_mask) + return; + + newcpu = get_cpu_in_numa(last_recv_cpu, hash); + if (newcpu >= 0) + *rcpu = newcpu; + else + newcpu = last_recv_cpu; + + if (cpu_to_node(cpu) == cpu_to_node(newcpu)) + return; + + if (tcpu >= nr_cpu_ids) + set_vecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu); +} + +static void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) +{ + struct net_device *ndev = skb->dev; + struct vecls_sock_flow_table *stb; + struct vecls_dev_flow_table *dtb; + struct netdev_rx_queue *rxqueue; + int rxq_id = -1; + + *cpu = -1; + last_qtail = 0;//unused + if (!ndev) + return; + + if (!is_vecls_config_netdev(ndev->name)) + return; + + rxqueue = ndev->_rx; + if (skb_rx_queue_recorded(skb)) { + rxq_id = skb_get_rx_queue(skb); + if (rxq_id >= ndev->real_num_rx_queues) { + vecls_debug("%s ndev:%s rxq:%d 
real_num:%d\n", __func__, + ndev->name, rxq_id, ndev->real_num_rx_queues); + return; + } + rxqueue += rxq_id; + } + + if (rxq_id < 0) + return; + + rcu_read_lock(); + stb = rcu_dereference(vecls_sock_flow_table); + dtb = rcu_dereference(rxqueue->vecls_ftb); + if (stb && dtb) + __vecls_set_cpu(skb, ndev, stb, dtb, rxq_id, cpu, last_qtail); + rcu_read_unlock(); +} + +static void vecls_dev_flow_table_free(struct rcu_head *rcu) +{ + struct vecls_dev_flow_table *table = container_of(rcu, + struct vecls_dev_flow_table, rcu); + vfree(table); +} + +static void vecls_dev_flow_table_cleanup(struct net_device *netdev, int queues) +{ + struct vecls_dev_flow_table *dtb; + struct netdev_rx_queue *queue; + int i; + + for (i = 0; i < queues; i++) { + queue = netdev->_rx + i; + spin_lock(&vecls_dev_flow_lock); + dtb = rcu_dereference_protected(queue->vecls_ftb, + lockdep_is_held(&vecls_dev_flow_lock)); + rcu_assign_pointer(queue->vecls_ftb, NULL); + spin_unlock(&vecls_dev_flow_lock); + if (dtb) + call_rcu(&dtb->rcu, vecls_dev_flow_table_free); + } +} + +static int vecls_dev_flow_table_release(void) +{ + struct vecls_netdev_info *vecls_dev; + struct net_device *netdev; + int devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + netdev = vecls_dev->netdev; + if (!netdev) + continue; + vecls_dev_flow_table_cleanup(netdev, netdev->num_rx_queues); + } + + return 0; +} + +static int _vecls_dev_flow_table_init(struct net_device *netdev) +{ + struct vecls_dev_flow_table *table; + int size = VECLS_DEV_FLOW_TABLE_NUM; + struct netdev_rx_queue *queue; + int i, j, ret = 0; + + size = roundup_pow_of_two(size); + vecls_debug("%s dev:%s num_rx_queues:%d mask:0x%x\n", + __func__, netdev->name, netdev->num_rx_queues, size - 1); + + for (i = 0; i < netdev->num_rx_queues; i++) { + table = vmalloc(VECLS_DEV_FLOW_TABLE_SIZE(size)); + if (!table) { + ret = -ENOMEM; + goto fail; + } + + table->mask = size - 1; + for (j 
= 0; j < size; j++) { + table->flows[j].cpu = VECLS_NO_CPU; + table->flows[j].isvalid = 0; + } + + queue = netdev->_rx + i; + + spin_lock(&vecls_dev_flow_lock); + rcu_assign_pointer(queue->vecls_ftb, table); + spin_unlock(&vecls_dev_flow_lock); + } + return ret; +fail: + vecls_dev_flow_table_cleanup(netdev, i); + return ret; +} + +static int vecls_dev_flow_table_init(void) +{ + struct vecls_netdev_info *vecls_dev; + struct net_device *ndev; + int i, err, devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + ndev = vecls_dev->netdev; + if (!ndev) + continue; + err = _vecls_dev_flow_table_init(ndev); + if (err) + goto out; + } + + return 0; +out: + for (i = 0; i < devid; i++) { + vecls_dev = get_vecls_netdev_info(i); + ndev = vecls_dev->netdev; + if (!ndev) + continue; + vecls_dev_flow_table_cleanup(ndev, ndev->num_rx_queues); + } + return err; +} + +static const struct vecls_hook_ops vecls_flow_ops = { + .vecls_flow_update = _vecls_flow_update, + .vecls_set_cpu = _vecls_set_cpu, + .vecls_timeout = _vecls_timeout, + .vecls_cfg_rxcls = NULL, +}; + +static int vecls_sock_flow_table_release(void) +{ + struct vecls_sock_flow_table *tb; + + mutex_lock(&vecls_sock_flow_mutex); + tb = rcu_dereference_protected(vecls_sock_flow_table, + lockdep_is_held(&vecls_sock_flow_mutex)); + if (tb) + rcu_assign_pointer(vecls_sock_flow_table, NULL); + mutex_unlock(&vecls_sock_flow_mutex); + synchronize_rcu(); + vfree(tb); + + return 0; +} + +int venetcls_flow_status(struct seq_file *seq, void *v) +{ + struct vecls_netdev_info *vecls_dev; + struct vecls_dev_flow_table *dtb; + struct netdev_rx_queue *queue; + struct net_device *netdev; + int devid, i, j; + unsigned long timeout; + + seq_printf(seq, "%-16s %-6s %-12s %-12s %-12s\n", + "Interface", "rxq", "flowCPU", "filterId", "timeout"); + spin_lock(&vecls_dev_flow_lock); + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = 
get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + netdev = vecls_dev->netdev; + if (!netdev) + continue; + for (i = 0; i < netdev->num_rx_queues; i++) { + queue = netdev->_rx + i; + dtb = rcu_dereference_protected(queue->vecls_ftb, + lockdep_is_held(&vecls_dev_flow_lock)); + if (!dtb) + continue; + for (j = 0; j < VECLS_DEV_FLOW_TABLE_NUM; j++) { + if (dtb->flows[j].cpu == VECLS_NO_CPU) + continue; + if (dtb->flows[j].isvalid == 0) + continue; + timeout = dtb->flows[j].timeout + VECLS_TIMEOUT; + if (time_before(jiffies, timeout)) { + seq_printf(seq, "%-16s %-6d %-12d %-12d %-12u\n", + vecls_dev->dev_name, i, dtb->flows[j].cpu, + dtb->flows[j].filter, + jiffies_to_msecs(timeout - jiffies)); + } + } + } + } + spin_unlock(&vecls_dev_flow_lock); + + return 0; +} + +static int vecls_sock_flow_table_init(void) +{ + struct vecls_sock_flow_table *table; + int size = VECLS_SOCK_FLOW_TABLE_NUM; + int i; + + size = roundup_pow_of_two(size); + table = vmalloc(VECLS_SOCK_FLOW_TABLE_SIZE(size)); + if (!table) + return -ENOMEM; + + vecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + vecls_debug("nr_cpu_ids:%d, vecls_cpu_mask:0x%x\n", nr_cpu_ids, vecls_cpu_mask); + + table->mask = size - 1; + for (i = 0; i < size; i++) + table->ents[i] = VECLS_NO_CPU; + + mutex_lock(&vecls_sock_flow_mutex); + rcu_assign_pointer(vecls_sock_flow_table, table); + mutex_unlock(&vecls_sock_flow_mutex); + + return 0; +} + +int vecls_flow_res_init(void) +{ + int err; + + err = vecls_sock_flow_table_init(); + if (err) + return err; + err = vecls_dev_flow_table_init(); + if (err) + goto clean; + + RCU_INIT_POINTER(vecls_ops, &vecls_flow_ops); + synchronize_rcu(); + + return 0; +clean: + vecls_sock_flow_table_release(); + return err; +} + +void vecls_flow_res_clean(void) +{ + RCU_INIT_POINTER(vecls_ops, NULL); + synchronize_rcu(); + vecls_sock_flow_table_release(); + vecls_dev_flow_table_release(); +} diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c new file mode 
100644 index 000000000000..856ff3b4427e --- /dev/null +++ b/net/venetcls/venetcls_main.c @@ -0,0 +1,1154 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/netdev_features.h> +#include <linux/ethtool.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/proc_fs.h> +#include <linux/rtnetlink.h> +#include <linux/seq_file.h> +#include "venetcls.h" + +int vecls_netdev_num; +static struct vecls_netdev_info vecls_netdev_info_table[VECLS_MAX_NETDEV_NUM]; + +int vecls_numa_num; +static int vecls_cluster_cpu_num, vecls_cluster_per_numa; +static struct vecls_numa_info *vecls_numa_info_table; + +int debug; +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "debug switch"); + +static int mode; +module_param(mode, int, 0444); +MODULE_PARM_DESC(mode, "mode, default 0"); + +static char ifname[64] = { 0 }; +module_param_string(ifname, ifname, sizeof(ifname), 0444); +MODULE_PARM_DESC(ifname, "ifname"); + +static char appname[64] = "redis-server"; +module_param_string(appname, appname, sizeof(appname), 0644); +MODULE_PARM_DESC(appname, "appname, default redis-server"); + +int match_ip_flag = 1; +module_param(match_ip_flag, int, 0644); +MODULE_PARM_DESC(match_ip_flag, "match ip flag"); + +static int strategy; +module_param(strategy, int, 0444); +MODULE_PARM_DESC(strategy, "strategy, default 0"); + +static int rxq_multiplex_limit = 1; +module_param(rxq_multiplex_limit, int, 0444); +MODULE_PARM_DESC(rxq_multiplex_limit, "rxq multiplex limit num, default 1"); + +static char irqname[64] = "comp"; +module_param_string(irqname, irqname, sizeof(irqname), 0644); +MODULE_PARM_DESC(irqname, "nic irq name string, default comp"); + +static bool check_params(void) +{ + if (mode != 0 && mode != 1) + return false; + + if (strlen(ifname) == 0) + return false; + + return true; +} + +int check_appname(char *task_name) +{ + char *start = appname, *end; + + if (!strlen(appname)) + return 0; + + // support 
appname: app1#app2#appN + while (*start != '\0') { + end = strchr(start, '#'); + if (end == start) { + start++; + continue; + } + + if (!end) { + if (!strncmp(task_name, start, strlen(start))) + return 0; + break; + } + + if (!strncmp(task_name, start, end - start)) + return 0; + start = end + 1; + } + return -EOPNOTSUPP; +} + +static u32 __ethtool_get_flags(struct net_device *dev) +{ + u32 flags = 0; + + if (dev->features & NETIF_F_LRO) + flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) + flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) + flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) + flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) + flags |= ETH_FLAG_RXHASH; + + return flags; +} + +static int __ethtool_set_flags(struct net_device *dev, u32 data) +{ + netdev_features_t features = 0, changed; + + if (data & ~ETH_ALL_FLAGS) + return -EINVAL; + + if (data & ETH_FLAG_LRO) + features |= NETIF_F_LRO; + if (data & ETH_FLAG_RXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_RX; + if (data & ETH_FLAG_TXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_TX; + if (data & ETH_FLAG_NTUPLE) + features |= NETIF_F_NTUPLE; + if (data & ETH_FLAG_RXHASH) + features |= NETIF_F_RXHASH; + + /* allow changing only bits set in hw_features */ + changed = (features ^ dev->features) & ETH_ALL_FEATURES; + if (changed & ~dev->hw_features) + return (changed & dev->hw_features) ? 
-EINVAL : -EOPNOTSUPP; + + dev->wanted_features = + (dev->wanted_features & ~changed) | (features & changed); + + __netdev_update_features(dev); + + return 0; +} + +static void ethtool_rxnfc_copy_to_user(void *useraddr, + const struct ethtool_rxnfc *rxnfc, + size_t size, const u32 *rule_buf) +{ + memcpy_r(useraddr, rxnfc, size); + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + + if (rule_buf) + memcpy_r(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32)); +} + +static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, + u32 cmd, void *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + int rc; + + if (!dev->ethtool_ops->set_rxnfc) + return -EOPNOTSUPP; + + if (cmd == ETHTOOL_SRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + memcpy_r(&info, useraddr, info_size); + rc = dev->ethtool_ops->set_rxnfc(dev, &info); + if (rc) + return rc; + + if (cmd == ETHTOOL_SRXCLSRLINS) + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); + + return 0; +} + +static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, + u32 cmd, void *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + if (cmd == ETHTOOL_GRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + memcpy_r(&info, useraddr, info_size); + + /* If FLOW_RSS was requested then user-space must be using the + * new definition, as FLOW_RSS is newer. + */ + if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { + info_size = sizeof(info); + memcpy_r(&info, useraddr, info_size); + /* Since malicious users may modify the original data, + * we need to check whether FLOW_RSS is still requested. 
+ */ + if (!(info.flow_type & FLOW_RSS)) + return -EINVAL; + } + + if (info.cmd != cmd) + return -EINVAL; + + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) + rule_buf = kcalloc(info.rule_cnt, sizeof(u32), + GFP_KERNEL); + if (!rule_buf) + return -ENOMEM; + } + } + + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); +err_out: + kfree(rule_buf); + + return ret; +} + +static noinline_for_stack int ethtool_get_channels(struct net_device *dev, + void *useraddr) +{ + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + + if (!dev->ethtool_ops->get_channels) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_channels(dev, &channels); + + memcpy_r(useraddr, &channels, sizeof(channels)); + return 0; +} + +static int ethtool_get_value(struct net_device *dev, char *useraddr, + u32 cmd, u32 (*actor)(struct net_device *)) +{ + struct ethtool_value edata = { .cmd = cmd }; + + if (!actor) + return -EOPNOTSUPP; + + edata.data = actor(dev); + + memcpy_r(useraddr, &edata, sizeof(edata)); + return 0; +} + +static int ethtool_set_value(struct net_device *dev, char *useraddr, + int (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + memcpy_r(&edata, useraddr, sizeof(edata)); + + return actor(dev, edata.data); +} + +static int dev_ethtool_kern(struct net *net, struct ifreq *ifr) +{ + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + void *useraddr = ifr->ifr_data; + u32 ethcmd, sub_cmd; + int rc; + netdev_features_t old_features; + + if (!dev || !netif_device_present(dev)) + return -ENODEV; + + memcpy_r(ðcmd, useraddr, sizeof(ethcmd)); + + if (ethcmd == ETHTOOL_PERQUEUE) + memcpy_r(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)); + else + sub_cmd = ethcmd; + + if (dev->ethtool_ops->begin) { + rc = dev->ethtool_ops->begin(dev); + 
if (rc < 0) + return rc; + } + old_features = dev->features; + + switch (ethcmd) { + case ETHTOOL_GFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + __ethtool_get_flags); + break; + case ETHTOOL_SFLAGS: + rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); + break; + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_SRXFH: + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_GCHANNELS: + rc = ethtool_get_channels(dev, useraddr); + break; + default: + rc = -EOPNOTSUPP; + } + + if (dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + + if (old_features != dev->features) + netdev_features_change(dev); + + return rc; +} + +int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd) +{ + struct ifreq ifr = {0}; + int ret; + + strscpy(ifr.ifr_name, ctx->netdev, IFNAMSIZ); + ifr.ifr_data = cmd; + + rtnl_lock(); + ret = dev_ethtool_kern(&init_net, &ifr); + rtnl_unlock(); + + return ret; +} + +struct vecls_netdev_info *get_vecls_netdev_info(unsigned int index) +{ + if (index >= VECLS_MAX_NETDEV_NUM) + return NULL; + return &vecls_netdev_info_table[index]; +} + +static struct vecls_netdev_info *alloc_vecls_netdev_info(void) +{ + if (vecls_netdev_num >= VECLS_MAX_NETDEV_NUM) + return NULL; + + return &vecls_netdev_info_table[vecls_netdev_num++]; +} + +static bool check_irq_name(const char *irq_name, struct vecls_netdev_info *vecls_dev) +{ + if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx") && + strlen(irqname) > 0 && !strstr(irq_name, irqname)) + return false; + + if (strstr(irq_name, vecls_dev->dev_name)) + return true; + + if (vecls_dev->netdev->dev.parent && + strstr(irq_name, dev_name(vecls_dev->netdev->dev.parent))) + return true; + + return false; +} + +static void 
get_netdev_queue_info(struct vecls_netdev_info *vecls_dev) +{ + struct vecls_netdev_queue_info *rxq_info; + struct irq_desc *desc; + int irq, cpu; + + for_each_irq_desc(irq, desc) { + if (!desc->action) + continue; + if (!desc->action->name) + continue; + if (!check_irq_name(desc->action->name, vecls_dev)) + continue; + if (vecls_dev->rxq_num >= VECLS_MAX_RXQ_NUM_PER_DEV) + break; + rxq_info = &vecls_dev->rxq[vecls_dev->rxq_num++]; + rxq_info->irq = irq; + cpu = cpumask_first(irq_data_get_effective_affinity_mask(&desc->irq_data)); + rxq_info->affinity_cpu = cpu; + vecls_debug("irq=%d, [%s], rxq_id=%d affinity_cpu:%d\n", + irq, desc->action->name, vecls_dev->rxq_num - 1, cpu); + } +} + +static int vecls_filter_enable(const char *dev_name, bool *old_state) +{ + struct ethtool_value eval = {0}; + struct cmd_context ctx = {0}; + int ret; + + strscpy(ctx.netdev, dev_name, IFNAMSIZ); + + eval.cmd = ETHTOOL_GFLAGS; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + if (eval.data & ETH_FLAG_NTUPLE) { + *old_state = true; + vecls_debug("%s ntuple is already on\n", dev_name); + return 0; + } + + // Set ntuple feature + eval.cmd = ETHTOOL_SFLAGS; + eval.data |= ETH_FLAG_NTUPLE; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + vecls_error("set %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + + // Get ntuple feature + eval.cmd = ETHTOOL_GFLAGS; + eval.data = 0; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + if (!(eval.data & ETH_FLAG_NTUPLE)) { + vecls_error("enable ntuple feature fail!\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +static void vecls_filter_restore(const char *dev_name, bool old_state) +{ + struct ethtool_value eval = {0}; + struct cmd_context ctx = {0}; + bool cur_filter_state; + int ret; + + strscpy(ctx.netdev, dev_name, IFNAMSIZ); + + eval.cmd = 
ETHTOOL_GFLAGS;
+	ret = send_ethtool_ioctl(&ctx, &eval);
+	if (ret != 0) {
+		vecls_error("get %s flags fail, ret:%d\n", dev_name, ret);
+		return;
+	}
+
+	cur_filter_state = (eval.data & ETH_FLAG_NTUPLE) ? true : false;
+	if (cur_filter_state == old_state)
+		return;
+
+	// Set ntuple feature
+	eval.cmd = ETHTOOL_SFLAGS;
+	if (old_state)
+		eval.data |= ETH_FLAG_NTUPLE;
+	else
+		eval.data &= ~ETH_FLAG_NTUPLE;
+	ret = send_ethtool_ioctl(&ctx, &eval);
+	if (ret != 0) {
+		vecls_error("set %s flags fail, ret:%d\n", dev_name, ret);
+		return;
+	}
+}
+
+/*
+ * Register the NIC named by the first @length bytes of @if_name.
+ *
+ * @if_name may point into a '#'-separated list, so only @length bytes
+ * (capped at IFNAMSIZ - 1) are used as the device name.  The device must
+ * be up and must not be a loopback device.  ntuple filtering is switched
+ * on, a slot of the netdev table is taken and the matching irqs are
+ * recorded.
+ *
+ * On success the reference taken by dev_get_by_name() is kept in the
+ * table slot; on failure it is dropped.  Returns 0 or a negative errno.
+ */
+static int init_single_vecls_dev(char *if_name, unsigned int length)
+{
+	struct vecls_netdev_info *vecls_dev;
+	char dev_name[IFNAMSIZ] = { 0 };
+	struct net_device *netdev;
+	bool old_state = false;
+	unsigned int count;
+	int ret;
+
+	/* strscpy() copies at most count - 1 bytes and always terminates. */
+	count = length < IFNAMSIZ ? length + 1 : IFNAMSIZ;
+	strscpy(dev_name, if_name, count);
+	netdev = dev_get_by_name(&init_net, dev_name);
+	if (!netdev) {
+		vecls_error("dev [%s] is not exist!\n", dev_name);
+		return -ENODEV;
+	}
+
+	if (!(netdev->flags & IFF_UP)) {
+		ret = -ENETDOWN;
+		vecls_error("dev:%s not up! flags=%d.\n", dev_name, netdev->flags);
+		goto out;
+	}
+
+	if (netdev->flags & IFF_LOOPBACK) {
+		ret = -EOPNOTSUPP;
+		vecls_error("Do not support loopback.\n");
+		goto out;
+	}
+
+	ret = vecls_filter_enable(dev_name, &old_state);
+	if (ret) {
+		vecls_error("dev [%s] not support ntuple! ret=%d\n", dev_name, ret);
+		goto out;
+	}
+
+	vecls_dev = alloc_vecls_netdev_info();
+	if (!vecls_dev) {
+		ret = -ENOMEM;
+		/* Put the ntuple flag back the way it was found. */
+		vecls_filter_restore(dev_name, old_state);
+		vecls_error("alloc vecls_dev fail! vecls_netdev_num:%d\n", vecls_netdev_num);
+		goto out;
+	}
+
+	memcpy_r(vecls_dev->dev_name, dev_name, IFNAMSIZ);
+	vecls_dev->old_filter_state = old_state;
+	vecls_dev->netdev = netdev;
+	get_netdev_queue_info(vecls_dev);
+	return 0;
+
+out:
+	dev_put(netdev);
+	return ret;
+}
+
+static void clean_vecls_netdev_info(void)
+{
+	struct vecls_netdev_info *vecls_dev;
+	struct net_device *netdev;
+	int devid;
+
+	for (devid = 0; devid < vecls_netdev_num; devid++) {
+		vecls_dev = get_vecls_netdev_info(devid);
+		if (!vecls_dev)
+			continue;
+		vecls_filter_restore(vecls_dev->dev_name, vecls_dev->old_filter_state);
+		netdev = vecls_dev->netdev;
+		if (netdev) {
+			vecls_dev->netdev = NULL;
+			dev_put(netdev);
+		}
+	}
+
+	vecls_netdev_num = 0;
+}
+
+static int init_vecls_netdev_info(char *netdev_str)
+{
+	char *start = netdev_str, *end;
+	int err = -ENODEV;
+
+	while (*start != '\0') {
+		// skip start #
+		end = strchr(start, '#');
+		if (end == start) {
+			start++;
+			continue;
+		}
+
+		// find the last ifname
+		if (!end) {
+			err = init_single_vecls_dev(start, strlen(start));
+			break;
+		}
+
+		err = init_single_vecls_dev(start, end - start);
+		if (err)
+			break;
+		start = end + 1;
+	}
+
+	return err;
+}
+
+struct vecls_numa_info *get_vecls_numa_info(unsigned int nid)
+{
+	if (nid >= vecls_numa_num)
+		return NULL;
+	return &vecls_numa_info_table[nid];
+}
+
+static void clean_vecls_numa_info(void)
+{
+	vecls_numa_num = 0;
+	kfree(vecls_numa_info_table);
+}
+
+static void init_numa_avail_cpus(int nid, struct vecls_numa_info *numa_info)
+{
+	int cpu;
+
+	vecls_debug("numa node %d: %*pb, %*pbl\n", nid, cpumask_pr_args(cpumask_of_node(nid)),
+		    cpumask_pr_args(cpumask_of_node(nid)));
+
+	bitmap_zero(numa_info->avail_cpus, VECLS_MAX_CPU_NUM);
+	for_each_cpu(cpu, cpumask_of_node(nid)) {
+		if (cpu >= VECLS_MAX_CPU_NUM)
+			return;
+		set_bit(cpu, numa_info->avail_cpus);
+	}
+}
+
+static void clean_vecls_rxq(void)
+{
+	struct vecls_numa_bound_dev_info *bound_dev;
+	struct vecls_netdev_info
*vecls_dev; + struct vecls_numa_info *numa_info; + int nid, devid; + + for (nid = 0; nid < vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + bound_dev = &numa_info->bound_dev[devid]; + kfree(bound_dev->cluster_info); + } + } +} + +static int init_numa_rxq_bitmap(int nid, struct vecls_numa_info *numa_info) +{ + int bound_rxq_num, cluster_id, cluster_idx, cur_idx; + struct vecls_numa_bound_dev_info *bound_dev; + struct vecls_netdev_info *vecls_dev; + int i, j, rxq_id, devid, cpu, ret = 0; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + bound_rxq_num = 0; + bound_dev = &numa_info->bound_dev[devid]; + memset(bound_dev->bitmap_rxq, RXQ_MAX_USECNT, sizeof(bound_dev->bitmap_rxq)); + bound_dev->cluster_info = kcalloc(vecls_cluster_per_numa, + sizeof(*bound_dev->cluster_info), GFP_ATOMIC); + if (!bound_dev->cluster_info) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < vecls_cluster_per_numa; i++) { + for (j = 0; j < VECLS_MAX_RXQ_NUM_PER_DEV; j++) { + bound_dev->cluster_info[i].rxqs[j].rxq_id = -1; + bound_dev->cluster_info[i].rxqs[j].status = RXQ_MAX_USECNT; + } + } + + for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { + cpu = vecls_dev->rxq[rxq_id].affinity_cpu; + if (cpu_to_node(cpu) == nid) { + bound_dev->bitmap_rxq[rxq_id] = 0; + cluster_id = cpu / vecls_cluster_cpu_num; + cluster_idx = cluster_id % vecls_cluster_per_numa; + bound_dev->cluster_info[cluster_idx].cluster_id = cluster_id; + cur_idx = bound_dev->cluster_info[cluster_idx].cur_freeidx++; + bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].rxq_id = rxq_id; + bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].status = 0; + bound_rxq_num++; + vecls_debug("cpu:%d cluster_id:%d cluster_idx:%d rxq_id:%d cur_idx:%d\n", + cpu, cluster_id, 
cluster_idx, rxq_id, cur_idx);
+			}
+		}
+
+		vecls_debug("nid:%d, dev_id:%d, dev:%s, rxq_num:%d, bound_rxq_num:%d\n",
+			    nid, devid, vecls_dev->dev_name, vecls_dev->rxq_num, bound_rxq_num);
+	}
+	return ret;
+
+out:
+	clean_vecls_rxq();
+	return ret;
+}
+
+/*
+ * Pick an rx queue for @cpu out of the cluster that @cpu belongs to.
+ *
+ * Every cluster slot of @bound_dev whose cluster_id equals
+ * cpu / vecls_cluster_cpu_num is searched for its least-used queue.  The
+ * first slot that has a queue used fewer than rxq_multiplex_limit times
+ * wins: that queue's use count is incremented and its id is returned.
+ *
+ * Returns the rxq id, or -1 when no matching slot has a usable queue.
+ */
+static int get_cluster_rxq(struct vecls_numa_bound_dev_info *bound_dev, int cpu)
+{
+	int cluster_id = cpu / vecls_cluster_cpu_num;
+	int i, j, best, min_used_count;
+	int rxq_id = -1;
+
+	for (i = 0; i < vecls_cluster_per_numa; i++) {
+		if (cluster_id != bound_dev->cluster_info[i].cluster_id)
+			continue;
+
+		/* Search each slot from scratch so no stale index is reused. */
+		best = -1;
+		min_used_count = RXQ_MAX_USECNT;
+		for (j = 0; j < VECLS_MAX_RXQ_NUM_PER_DEV; j++) {
+			if (bound_dev->cluster_info[i].rxqs[j].rxq_id == -1)
+				continue;
+			if (bound_dev->cluster_info[i].rxqs[j].status < min_used_count) {
+				min_used_count = bound_dev->cluster_info[i].rxqs[j].status;
+				best = j;
+			}
+		}
+
+		if (best < 0 || min_used_count >= rxq_multiplex_limit) {
+			vecls_debug("cluster:%d no free rxq for cpu:%d\n", cluster_id, cpu);
+			continue;
+		}
+
+		rxq_id = bound_dev->cluster_info[i].rxqs[best].rxq_id;
+		bound_dev->cluster_info[i].rxqs[best].status++;
+		vecls_debug("cluster:%d cpu:%d alloc rxq_id:%d use:%d\n", cluster_id, cpu,
+			    rxq_id, bound_dev->cluster_info[i].rxqs[best].status);
+		/* One allocation per call: do not charge a second slot. */
+		break;
+	}
+	vecls_debug("%s allcluster:%d rxq:%d for cpu:%d\n", __func__, cluster_id, rxq_id, cpu);
+	return rxq_id;
+}
+
+/*
+ * Drop one use of @rxq_id from the first cluster slot entry that holds it
+ * with a non-zero use count.  Returns 0 on success, -1 if none matches.
+ */
+static int put_cluster_rxq(struct vecls_numa_bound_dev_info *bound_dev, int rxq_id)
+{
+	int i, j;
+
+	for (i = 0; i < vecls_cluster_per_numa; i++) {
+		for (j = 0; j < VECLS_MAX_RXQ_NUM_PER_DEV; j++) {
+			if (bound_dev->cluster_info[i].rxqs[j].status > 0 &&
+			    bound_dev->cluster_info[i].rxqs[j].rxq_id == rxq_id) {
+				bound_dev->cluster_info[i].rxqs[j].status--;
+				vecls_debug("free rxq_id:%d use:%d\n", rxq_id,
+					    bound_dev->cluster_info[i].rxqs[j].status);
+				return 0;
+			}
+		}
+	}
+	vecls_debug("no match malloced rxq_id:%d\n", rxq_id);
+	return -1;
+}
+
+int alloc_rxq_id(int nid, int cpu, int devid)
+{
+	struct
vecls_numa_bound_dev_info *bound_dev; + int i, rxq_id, min_used_count = RXQ_MAX_USECNT; + struct vecls_numa_info *numa_info; + + numa_info = get_vecls_numa_info(nid); + if (!numa_info) { + vecls_error("error nid:%d\n", nid); + return -EINVAL; + } + + if (devid >= VECLS_MAX_NETDEV_NUM) { + vecls_error("error bound_dev index:%d\n", devid); + return -EINVAL; + } + bound_dev = &numa_info->bound_dev[devid]; + + if (strategy == 1) { + rxq_id = get_cluster_rxq(bound_dev, cpu); + if (rxq_id < 0 || rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) + vecls_debug("failed to get rxq_id:%d in cluster, try numa\n", rxq_id); + else + goto found; + } + + for (i = 0; i < VECLS_MAX_RXQ_NUM_PER_DEV; i++) { + if (bound_dev->bitmap_rxq[i] < min_used_count) { + min_used_count = bound_dev->bitmap_rxq[i]; + rxq_id = i; + } + } + if (min_used_count >= RXQ_MAX_USECNT || min_used_count >= rxq_multiplex_limit) { + vecls_error("alloc rxq fail! nid:%d, devid:%d\n", nid, devid); + return -EINVAL; + } + +found: + bound_dev->bitmap_rxq[rxq_id]++; + vecls_debug("alloc nid:%d, dev_id:%d, rxq_id:%d use:%d\n", nid, devid, + rxq_id, bound_dev->bitmap_rxq[rxq_id]); + return rxq_id; +} + +void free_rxq_id(int nid, int devid, int rxq_id) +{ + struct vecls_numa_bound_dev_info *bound_dev; + struct vecls_numa_info *numa_info; + + numa_info = get_vecls_numa_info(nid); + if (!numa_info) { + vecls_error("error nid:%d\n", nid); + return; + } + + if (devid >= VECLS_MAX_NETDEV_NUM) { + vecls_error("error bound_dev index:%d\n", devid); + return; + } + bound_dev = &numa_info->bound_dev[devid]; + + if (rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) { + vecls_error("error rxq_id:%d\n", rxq_id); + return; + } + + if (strategy == 1) + put_cluster_rxq(bound_dev, rxq_id); + + if (bound_dev->bitmap_rxq[rxq_id] <= 0) { + vecls_error("error nid:%d, devid:%d, rxq_id:%d\n", nid, devid, rxq_id); + return; + } + + bound_dev->bitmap_rxq[rxq_id]--; + vecls_debug("free nid:%d, dev_id:%d, rxq_id:%d use:%d\n", nid, devid, + rxq_id, 
bound_dev->bitmap_rxq[rxq_id]); +} + +static int init_vecls_numa_info(void) +{ + struct vecls_numa_info *numa_info; + int nid, ret = 0; + + vecls_numa_num = num_online_nodes(); + vecls_numa_info_table = kcalloc(vecls_numa_num, sizeof(*vecls_numa_info_table), GFP_ATOMIC); + if (!vecls_numa_info_table) { + ret = -ENOMEM; + vecls_error("vecls_numa_info_table alloc failed:%d\n", ret); + return ret; + } + + vecls_cluster_cpu_num = cpumask_weight(topology_cluster_cpumask(raw_smp_processor_id())); + vecls_cluster_per_numa = (nr_cpu_ids / vecls_cluster_cpu_num) / vecls_numa_num; + vecls_debug("vecls_numa_num=%d cluster_cpu_num:%d cluster_cpu_num:%d\n", + vecls_numa_num, vecls_cluster_per_numa, vecls_cluster_cpu_num); + + for (nid = 0; nid < vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + init_numa_avail_cpus(nid, numa_info); + } + + return ret; +} + +static int alloc_available_cpu(int nid, struct vecls_numa_info *numa_info) +{ + int cpu; + + cpu = find_first_bit(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); + if (cpu >= VECLS_MAX_CPU_NUM) { + vecls_error("no available cpus: nid=%d, cpu=%d\n", nid, cpu); + return -1; + } + + clear_bit(cpu, numa_info->avail_cpus); + return cpu; +} + +static void add_netdev_irq_affinity_cpu(struct vecls_netdev_info *vecls_dev, int rxq_id, int cpu) +{ + struct vecls_netdev_queue_info *rxq_info; + + if (rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) + return; + + rxq_info = &vecls_dev->rxq[rxq_id]; + rxq_info->affinity_cpu = cpu; +} + +static void config_affinity_strategy_default(struct vecls_netdev_info *vecls_dev) +{ + struct vecls_numa_info *numa_info; + int rxq_num = vecls_dev->rxq_num; + int rxq_per_numa = rxq_num / vecls_numa_num; + int remain = rxq_num - rxq_per_numa * vecls_numa_num; + int numa_rxq_id, rxq_id, nid, cpu; + + vecls_debug("dev=%s, rxq_num=%d, rxq_per_numa=%d, remain=%d\n", vecls_dev->dev_name, + rxq_num, rxq_per_numa, remain); + + // average config rxq to every numa + for (nid = 0; nid < 
vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + for (numa_rxq_id = 0; numa_rxq_id < rxq_per_numa; numa_rxq_id++) { + cpu = alloc_available_cpu(nid, numa_info); + if (cpu < 0) + break; + + rxq_id = rxq_per_numa * nid + numa_rxq_id; + add_netdev_irq_affinity_cpu(vecls_dev, rxq_id, cpu); + } + } + + if (!remain) + return; + + // config remain rxq to every numa + numa_rxq_id = 0; + for (nid = 0; nid < vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + if (numa_rxq_id >= remain) + break; + cpu = alloc_available_cpu(nid, numa_info); + if (cpu < 0) + break; + + rxq_id = rxq_per_numa * vecls_numa_num + numa_rxq_id; + numa_rxq_id++; + add_netdev_irq_affinity_cpu(vecls_dev, rxq_id, cpu); + } +} + +static void config_affinity_strategy_cluster(struct vecls_netdev_info *vecls_dev) +{ + int rxq_num = vecls_dev->rxq_num; + int rxq_per_numa = rxq_num / vecls_numa_num; + int remain = rxq_num - rxq_per_numa * vecls_numa_num; + int cpu_idx = vecls_cluster_cpu_num - 1; + int cluster, cpu, rxq_id = 0, round; + + round = rxq_per_numa < vecls_cluster_per_numa ? 
rxq_per_numa : vecls_cluster_per_numa; + if (remain > 0) + round++; + vecls_debug("round=%d\n", round); + + while (rxq_id < vecls_dev->rxq_num) { + for (cluster = 0; cluster < vecls_cluster_per_numa * vecls_numa_num; cluster++) { + if (cluster % vecls_cluster_per_numa >= round) + continue; + cpu = cluster * vecls_cluster_cpu_num + cpu_idx; + if (rxq_id >= vecls_dev->rxq_num) + break; + add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); + } + cpu_idx--; + if (--cpu_idx < 0) + cpu_idx = vecls_cluster_cpu_num - 1; + } +} + +static void config_affinity_strategy_numa(struct vecls_netdev_info *vecls_dev) +{ + int rxq_num = vecls_dev->rxq_num; + int rxq_per_numa = rxq_num / vecls_numa_num; + int cpu_per_numa = nr_cpu_ids / vecls_numa_num; + int remain = rxq_num - rxq_per_numa * vecls_numa_num; + struct vecls_numa_info *numa_info; + int numa_start_cpu, numa_cpu_id; + int rxq_id = 0, nid, cpu; + + for (nid = 0; nid < vecls_numa_num; nid++) { + numa_info = get_vecls_numa_info(nid); + if (!numa_info) + continue; + numa_start_cpu = find_first_bit(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); + for (numa_cpu_id = 0; numa_cpu_id < rxq_per_numa; numa_cpu_id++) { + cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa); + if (rxq_id >= vecls_dev->rxq_num) + break; + add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); + } + if (remain-- > 0) { + cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa); + add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); + } + } +} + +static void config_affinity_strategy_custom(struct vecls_netdev_info *vecls_dev) +{ + vecls_debug("dev=%s\n", vecls_dev->dev_name); +} + +static void config_affinity_strategy(void) +{ + struct vecls_netdev_info *vecls_dev; + int devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + switch (strategy) { + case 1: + config_affinity_strategy_cluster(vecls_dev); + break; + case 2: + config_affinity_strategy_numa(vecls_dev); + break; + 
case 3: + config_affinity_strategy_custom(vecls_dev); + break; + case 0: + default: + config_affinity_strategy_default(vecls_dev); + break; + } + } +} + +static inline void irq_set_affinity_wrapper(int rxq, int irq, int cpu) +{ + int err = 0; + + err = irq_set_affinity(irq, get_cpu_mask(cpu)); + vecls_debug("rxq=%d, irq=%d, cpu=%d, err=%d\n", rxq, irq, cpu, err); +} + +static void enable_affinity_strategy(void) +{ + struct vecls_netdev_queue_info *rxq_info; + struct vecls_netdev_info *vecls_dev; + int rxq_id, devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { + rxq_info = &vecls_dev->rxq[rxq_id]; + irq_set_affinity_wrapper(rxq_id, rxq_info->irq, rxq_info->affinity_cpu); + } + } +} + +static inline void netif_set_xps_queue_wrapper(struct net_device *netdev, int rxq_id, + const struct cpumask *cpu_mask) +{ + int err = 0; + + err = netif_set_xps_queue(netdev, cpu_mask, rxq_id); + vecls_debug("name=%s, rxq_id=%d, mask=%*pbl, err=%d\n", netdev->name, rxq_id, + cpumask_pr_args(cpu_mask), err); +} + +static void set_netdev_xps_queue(bool enable) +{ + const struct cpumask clear_mask = { 0 }; + struct vecls_netdev_info *vecls_dev; + const struct cpumask *cpu_mask; + int rxq_id, devid, cpu, nid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { + cpu = vecls_dev->rxq[rxq_id].affinity_cpu; + nid = cpu_to_node(cpu); + if (enable) + cpu_mask = cpumask_of_node(nid); + else + cpu_mask = &clear_mask; + + netif_set_xps_queue_wrapper(vecls_dev->netdev, rxq_id, cpu_mask); + } + } +} + +static int __maybe_unused venetcls_status_seq_show(struct seq_file *seq, void *v) +{ + int err; + + if (mode == 0) + err = venetcls_ntuple_status(seq, v); + else + err = venetcls_flow_status(seq, v); + return err; +} + 
+/*
+ * Module entry point.
+ *
+ * Validates the module parameters, builds the NUMA and netdev tables,
+ * applies the irq affinity strategy, computes the per-NUMA rxq bitmaps,
+ * optionally configures XPS, initialises the ntuple (mode 0) or flow
+ * (mode 1) resources and finally creates the "venet_status" proc entry.
+ *
+ * On failure every step completed so far is undone in reverse order.
+ * Returns 0 on success or a negative errno.
+ */
+static __init int vecls_init(void)
+{
+	struct vecls_numa_info *numa_info;
+	int nid, err;
+
+	if (!check_params())
+		return -EINVAL;
+
+	err = init_vecls_numa_info();
+	if (err)
+		return err;
+
+	/* A partial failure may already hold device references. */
+	err = init_vecls_netdev_info(ifname);
+	if (err)
+		goto clean_netdev;
+
+	// Set irq affinity
+	config_affinity_strategy();
+	enable_affinity_strategy();
+
+	// Calculate rxq bounded to one numa
+	for (nid = 0; nid < vecls_numa_num; nid++) {
+		numa_info = get_vecls_numa_info(nid);
+		if (!numa_info)
+			continue;
+		err = init_numa_rxq_bitmap(nid, numa_info);
+		/*
+		 * init_numa_rxq_bitmap() runs clean_vecls_rxq() itself when it
+		 * fails, so skip that step here to avoid a double free.
+		 */
+		if (err)
+			goto clean_netdev;
+	}
+
+#ifdef CONFIG_XPS
+	set_netdev_xps_queue(true);
+#endif
+
+	if (mode == 0)
+		err = vecls_ntuple_res_init();
+	else
+		err = vecls_flow_res_init();
+
+	if (err)
+		goto clean_xps;
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_create_net_single("venet_status", 0444, init_net.proc_net,
+				    venetcls_status_seq_show, NULL)) {
+		err = -ENOMEM;
+		goto clean_res;
+	}
+#endif
+
+	return 0;
+
+#ifdef CONFIG_PROC_FS
+clean_res:
+	if (mode == 0)
+		vecls_ntuple_res_clean();
+	else
+		vecls_flow_res_clean();
+#endif
+clean_xps:
+#ifdef CONFIG_XPS
+	set_netdev_xps_queue(false);
+#endif
+	clean_vecls_rxq();
+clean_netdev:
+	clean_vecls_netdev_info();
+	clean_vecls_numa_info();
+	return err;
+}
+
+static __exit void vecls_exit(void)
+{
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("venet_status", init_net.proc_net);
+#endif
+	if (mode == 0)
+		vecls_ntuple_res_clean();
+	else
+		vecls_flow_res_clean();
+
+#ifdef CONFIG_XPS
+	set_netdev_xps_queue(false);
+#endif
+
+	clean_vecls_rxq();
+	clean_vecls_netdev_info();
+	clean_vecls_numa_info();
+}
+
+module_init(vecls_init);
+module_exit(vecls_exit);
+
+MODULE_DESCRIPTION("venetcls");
+MODULE_LICENSE("GPL");
diff --git a/net/venetcls/venetcls_ntuple.c b/net/venetcls/venetcls_ntuple.c
new file mode 100644
index 000000000000..ad3c10f8ae5f
--- /dev/null
+++ b/net/venetcls/venetcls_ntuple.c
@@ -0,0 +1,713 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/inetdevice.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/inet.h>
+#include <linux/jhash.h>
+#include
<linux/venetcls.h>
+#include <net/addrconf.h>
+#include <net/sock.h>
+
+#include "venetcls.h"
+
+struct vecls_sk_rule_list vecls_sk_rules, vecls_sk_list;
+static struct workqueue_struct *do_cfg_workqueue;
+static atomic_t vecls_worker_count = ATOMIC_INIT(0);
+
+static void init_vecls_sk_rules(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++)
+		INIT_HLIST_HEAD(vecls_sk_rules.hash + i);
+	mutex_init(&vecls_sk_rules.mutex);
+}
+
+static inline u32 get_hash(struct cmd_context ctx)
+{
+	u32 hash;
+
+	if (ctx.is_ipv6)
+		hash = jhash_2words(jhash(ctx.dip6, 16, 0), ctx.dport, 0);
+	else
+		hash = jhash_2words(ctx.dip4, ctx.dport, 0);
+
+	return hash;
+}
+
+static inline struct hlist_head *get_rule_hashlist(struct cmd_context ctx)
+{
+	u32 hash;
+
+	hash = get_hash(ctx);
+	return vecls_sk_rules.hash + (hash & VECLS_SK_RULE_HASHMASK);
+}
+
+/*
+ * Return the vecls_sk_list bucket for @sk.
+ *
+ * The key is the pointer value itself, so &sk is hashed.  Passing sk to
+ * jhash() would instead hash the first sizeof(sk) bytes of the object it
+ * points to, and the bucket would then depend on what those bytes hold.
+ */
+static inline struct hlist_head *get_sk_hashlist(void *sk)
+{
+	return vecls_sk_list.hash + (jhash(&sk, sizeof(sk), 0) & VECLS_SK_RULE_HASHMASK);
+}
+
+static void add_sk_rule(int devid, struct cmd_context ctx, void *sk, int nid)
+{
+	struct hlist_head *hlist = get_rule_hashlist(ctx);
+	struct hlist_head *sk_hlist = get_sk_hashlist(sk);
+	struct vecls_sk_rule *rule;
+	struct vecls_sk_entry *entry;
+
+	rule = kzalloc(sizeof(*rule), GFP_ATOMIC);
+	if (!rule) {
+		vecls_error("alloc rule failed\n");
+		return;
+	}
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry) {
+		vecls_error("alloc entry failed\n");
+		kfree(rule);
+		return;
+	}
+
+	rule->sk = sk;
+	rule->is_ipv6 = ctx.is_ipv6;
+	rule->dip4 = ctx.dip4;
+	memcpy(rule->dip6, ctx.dip6, sizeof(rule->dip6));
+	rule->dport = ctx.dport;
+	rule->devid = devid;
+	rule->action = ctx.action;
+	rule->ruleid = ctx.ret_loc;
+	rule->nid = nid;
+	hlist_add_head(&rule->node, hlist);
+
+	entry->sk = sk;
+	entry->sk_rule_hash = get_hash(ctx);
+	hlist_add_head(&entry->node, sk_hlist);
+}
+
+static struct vecls_sk_entry *get_sk_entry(void *sk)
+{
+	struct hlist_head *sk_hlist
= get_sk_hashlist(sk); + struct vecls_sk_entry *entry = NULL; + + hlist_for_each_entry(entry, sk_hlist, node) { + if (entry->sk == sk) + break; + } + return entry; +} + +static void del_sk_rule(struct vecls_sk_rule *rule) +{ + struct vecls_sk_entry *entry; + + entry = get_sk_entry(rule->sk); + if (!entry) + return; + hlist_del_init(&entry->node); + kfree(entry); + + vecls_debug("del rule=%p\n", rule); + hlist_del_init(&rule->node); + kfree(rule); +} + +static struct vecls_sk_rule *get_sk_rule(int devid, struct cmd_context ctx) +{ + struct hlist_head *hlist = get_rule_hashlist(ctx); + struct vecls_sk_rule *rule = NULL; + + hlist_for_each_entry(rule, hlist, node) { + if (rule->devid != devid || rule->dport != ctx.dport) + continue; + if (!rule->is_ipv6 && rule->dip4 == ctx.dip4) + break; + if (rule->is_ipv6 && !memcmp(rule->dip6, ctx.dip6, sizeof(rule->dip6))) + break; + } + return rule; +} + +static struct vecls_sk_rule *get_rule_from_sk(int devid, void *sk) +{ + struct vecls_sk_rule *rule = NULL; + struct vecls_sk_entry *entry; + struct hlist_head *hlist; + + entry = get_sk_entry(sk); + if (!entry) + return NULL; + + hlist = vecls_sk_rules.hash + (entry->sk_rule_hash & VECLS_SK_RULE_HASHMASK); + hlist_for_each_entry(rule, hlist, node) { + if (rule->devid == devid && rule->sk == sk) + break; + } + return rule; +} + +static inline bool reuseport_check(int devid, struct cmd_context ctx) +{ + return !!get_sk_rule(devid, ctx); +} + +static u32 get_first_ip4_addr(struct net *net) +{ + struct in_device *in_dev; + struct net_device *dev; + struct in_ifaddr *ifa; + u32 dip4 = 0; + + rtnl_lock(); + rcu_read_lock(); + for_each_netdev(net, dev) { + if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP)) + continue; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (!strcmp(dev->name, ifa->ifa_label)) { + dip4 = ifa->ifa_local; + vecls_debug("dev:%s dip:%pI4\n", dev->name, &dip4); + goto out; + } + } + } +out: + 
rcu_read_unlock(); + rtnl_unlock(); + return dip4; +} + +static void get_first_ip6_addr(struct net *net, u32 *dip6) +{ + struct inet6_dev *idev; + struct net_device *dev; + struct inet6_ifaddr *ifp; + + rtnl_lock(); + rcu_read_lock(); + for_each_netdev(net, dev) { + if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP)) + continue; + idev = __in6_dev_get(dev); + if (!idev) + continue; + list_for_each_entry_rcu(ifp, &idev->addr_list, if_list) { + if (ifp->scope == RT_SCOPE_HOST) + continue; + if (ifp->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED)) + continue; + memcpy(dip6, &ifp->addr, sizeof(ifp->addr)); + vecls_debug("dev:%s dip:%pI6\n", dev->name, dip6); + goto out; + } + } +out: + rcu_read_unlock(); + rtnl_unlock(); +} + +static void get_sk_rule_addr(struct sock *sk, struct cfg_param *ctx_p) +{ + bool is_ipv6 = !!(sk->sk_family == AF_INET6); + u16 *dport = &ctx_p->ctx.dport; + u32 *dip4 = &ctx_p->ctx.dip4; + u32 *dip6 = &ctx_p->ctx.dip6[0]; + + *dport = htons(sk->sk_num); + ctx_p->ctx.is_ipv6 = is_ipv6; + + if (!match_ip_flag) { + *dip4 = 0; + memset(dip6, 0, sizeof(sk->sk_v6_rcv_saddr)); + return; + } + + if (is_ipv6) { + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + memcpy(dip6, &sk->sk_v6_rcv_saddr, sizeof(sk->sk_v6_rcv_saddr)); + else + get_first_ip6_addr(sock_net(sk), dip6); + + } else { + if (sk->sk_rcv_saddr) + *dip4 = sk->sk_rcv_saddr; + else + *dip4 = get_first_ip4_addr(sock_net(sk)); + } +} + +static int rxclass_rule_del(struct cmd_context *ctx, __u32 loc) +{ + struct ethtool_rxnfc nfccmd; + int err; + + nfccmd.cmd = ETHTOOL_SRXCLSRLDEL; + nfccmd.fs.location = loc; + err = send_ethtool_ioctl(ctx, &nfccmd); + if (err < 0) + vecls_debug("rmgr: Cannot delete RX class rule, loc:%u\n", loc); + return err; +} + +static int rmgr_ins(struct rmgr_ctrl *rmgr, __u32 loc) +{ + if (loc >= rmgr->size) { + vecls_error("rmgr: Location out of range\n"); + return -1; + } + + set_bit(loc, rmgr->slot); + return 0; +} + +static int rmgr_find_empty_slot(struct rmgr_ctrl 
*rmgr, struct ethtool_rx_flow_spec *fsp) +{ + __u32 loc, slot_num; + + if (rmgr->driver_select) + return 0; + + loc = rmgr->size - 1; + slot_num = loc / BITS_PER_LONG; + if (!~(rmgr->slot[slot_num] | (~1UL << rmgr->size % BITS_PER_LONG))) { + loc -= 1 + (loc % BITS_PER_LONG); + slot_num--; + } + + while (loc < rmgr->size && !~(rmgr->slot[slot_num])) { + loc -= BITS_PER_LONG; + slot_num--; + } + + while (loc < rmgr->size && test_bit(loc, rmgr->slot)) + loc--; + + if (loc < rmgr->size) { + fsp->location = loc; + return rmgr_ins(rmgr, loc); + } + + return -1; +} + +static int rxclass_get_dev_info(struct cmd_context *ctx, __u32 *count, int *driver_select) +{ + struct ethtool_rxnfc nfccmd; + int err; + + nfccmd.cmd = ETHTOOL_GRXCLSRLCNT; + nfccmd.data = 0; + err = send_ethtool_ioctl(ctx, &nfccmd); + *count = nfccmd.rule_cnt; + if (driver_select) + *driver_select = !!(nfccmd.data & RX_CLS_LOC_SPECIAL); + if (err < 0) + vecls_debug("rxclass: Cannot get RX class rule count\n"); + + return err; +} + +static int rmgr_init(struct cmd_context *ctx, struct rmgr_ctrl *rmgr) +{ + struct ethtool_rxnfc *nfccmd; + __u32 *rule_locs; + int i, err = 0; + + memset(rmgr, 0, sizeof(*rmgr)); + err = rxclass_get_dev_info(ctx, &rmgr->n_rules, &rmgr->driver_select); + if (err < 0) + return err; + + if (rmgr->driver_select) + return err; + + nfccmd = kzalloc(sizeof(*nfccmd) + (rmgr->n_rules * sizeof(__u32)), GFP_ATOMIC); + if (!nfccmd) { + vecls_error("rmgr: Cannot allocate memory for RX class rule locations\n"); + err = -ENOMEM; + goto out; + } + + nfccmd->cmd = ETHTOOL_GRXCLSRLALL; + nfccmd->rule_cnt = rmgr->n_rules; + err = send_ethtool_ioctl(ctx, nfccmd); + if (err < 0) { + vecls_debug("rmgr: Cannot get RX class rules\n"); + goto out; + } + + rmgr->size = nfccmd->data; + if (rmgr->size == 0 || rmgr->size < rmgr->n_rules) { + vecls_error("rmgr: Invalid RX class rules table size\n"); + err = -EINVAL; + goto out; + } + + rmgr->slot = kzalloc(BITS_TO_LONGS(rmgr->size) * sizeof(long), 
GFP_ATOMIC); + if (!rmgr->slot) { + vecls_error("rmgr: Cannot allocate memory for RX class rules\n"); + err = -ENOMEM; + goto out; + } + + rule_locs = nfccmd->rule_locs; + for (i = 0; i < rmgr->n_rules; i++) { + err = rmgr_ins(rmgr, rule_locs[i]); + if (err < 0) + break; + } + +out: + kfree(nfccmd); + return err; +} + +static void rmgr_cleanup(struct rmgr_ctrl *rmgr) +{ + kfree(rmgr->slot); + rmgr->slot = NULL; + rmgr->size = 0; +} + +static int rmgr_set_location(struct cmd_context *ctx, + struct ethtool_rx_flow_spec *fsp) +{ + struct rmgr_ctrl rmgr; + int ret; + + ret = rmgr_init(ctx, &rmgr); + if (ret < 0) + goto out; + + ret = rmgr_find_empty_slot(&rmgr, fsp); +out: + rmgr_cleanup(&rmgr); + return ret; +} + +static int rxclass_rule_ins(struct cmd_context *ctx, + struct ethtool_rx_flow_spec *fsp, u32 rss_context) +{ + struct ethtool_rxnfc nfccmd; + u32 loc = fsp->location; + int ret; + + if (loc & RX_CLS_LOC_SPECIAL) { + ret = rmgr_set_location(ctx, fsp); + if (ret < 0) + return ret; + } + + nfccmd.cmd = ETHTOOL_SRXCLSRLINS; + nfccmd.rss_context = rss_context; + nfccmd.fs = *fsp; + ret = send_ethtool_ioctl(ctx, &nfccmd); + if (ret < 0) { + vecls_debug("Can not insert the clasification rule\n"); + return ret; + } + + if (loc & RX_CLS_LOC_SPECIAL) + vecls_debug("Added rule with ID %d\n", nfccmd.fs.location); + + return 0; +} + +static int cfg_ethtool_rule(struct cmd_context *ctx, bool is_del) +{ + struct ethtool_rx_flow_spec *fsp, rx_rule_fs; + u32 rss_context = 0; + bool is_ipv6 = ctx->is_ipv6; + int ret, i; + + if (ctx->is_ipv6) + vecls_debug("del:%d dev:%s dip:%pI6 dport:%d action:%d ruleid:%u del_ruleid:%u\n", + is_del, ctx->netdev, &ctx->dip6, ntohs(ctx->dport), ctx->action, + ctx->ruleid, ctx->del_ruleid); + else + vecls_debug("del:%d dev:%s dip:%pI4 dport:%d action:%d ruleid:%u del_ruleid:%u\n", + is_del, ctx->netdev, &ctx->dip4, ntohs(ctx->dport), ctx->action, + ctx->ruleid, ctx->del_ruleid); + + if (is_del) + return rxclass_rule_del(ctx, ctx->del_ruleid); 
+ + ctx->ret_loc = -1; + + fsp = &rx_rule_fs; + memset(fsp, 0, sizeof(*fsp)); + if (is_ipv6) { + fsp->flow_type = TCP_V6_FLOW; + memcpy(fsp->h_u.tcp_ip6_spec.ip6dst, ctx->dip6, sizeof(ctx->dip6)); + fsp->h_u.tcp_ip6_spec.pdst = ctx->dport; + fsp->m_u.tcp_ip6_spec.pdst = (u16)~0ULL; + if (ctx->dip6[0] | ctx->dip6[1] | ctx->dip6[2] | ctx->dip6[3]) { + for (i = 0; i < 4; i++) + fsp->m_u.tcp_ip6_spec.ip6dst[i] = (u32)~0ULL; + } + } else { + fsp->flow_type = TCP_V4_FLOW; + fsp->h_u.tcp_ip4_spec.ip4dst = ctx->dip4; + fsp->h_u.tcp_ip4_spec.pdst = ctx->dport; + fsp->m_u.tcp_ip4_spec.pdst = (u16)~0ULL; + if (ctx->dip4) + fsp->m_u.tcp_ip4_spec.ip4dst = (u32)~0ULL; + } + fsp->location = RX_CLS_LOC_ANY; + if (ctx->ruleid) + fsp->location = ctx->ruleid; + fsp->ring_cookie = ctx->action; + + ret = rxclass_rule_ins(ctx, &rx_rule_fs, rss_context); + if (!ret) + ctx->ret_loc = rx_rule_fs.location; + return ret; +} + +static void cfg_work(struct work_struct *work) +{ + struct cfg_param *ctx_p = container_of(work, struct cfg_param, work); + struct vecls_netdev_info *vecls_dev; + struct vecls_sk_rule *rule; + int devid, rxq_id, err; + + mutex_lock(&vecls_sk_rules.mutex); + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + strscpy(ctx_p->ctx.netdev, vecls_dev->dev_name, IFNAMSIZ); + if (!(ctx_p->is_del)) { + if (reuseport_check(devid, ctx_p->ctx)) { + if (ctx_p->ctx.is_ipv6) + vecls_debug("dip:%pI6, dport:%d reuse!\n", + &ctx_p->ctx.dip6, ntohs(ctx_p->ctx.dport)); + else + vecls_debug("dip:%pI4, dport:%d reuse!\n", + &ctx_p->ctx.dip4, ntohs(ctx_p->ctx.dport)); + continue; + } + + // Calculate the bound queue + rxq_id = alloc_rxq_id(ctx_p->nid, ctx_p->cpu, devid); + if (rxq_id < 0) + continue; + + // Config Ntuple rule to dev + ctx_p->ctx.action = (u16)rxq_id; + err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); + if (err) { + vecls_debug("Add sk:%p, dev_id:%d, rxq:%d, err:%d\n", + ctx_p->sk, devid, rxq_id, 
err); + free_rxq_id(ctx_p->nid, devid, rxq_id); + continue; + } + add_sk_rule(devid, ctx_p->ctx, ctx_p->sk, ctx_p->nid); + } else { + rule = get_rule_from_sk(devid, ctx_p->sk); + if (!rule) { + vecls_debug("rule not found! sk:%p, devid:%d, dip4:%pI4, dport:%d\n", + ctx_p->sk, devid, &ctx_p->ctx.dip4, + ntohs(ctx_p->ctx.dport)); + continue; + } + + // Config Ntuple rule to dev + ctx_p->ctx.del_ruleid = rule->ruleid; + err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); + // Free the bound queue + free_rxq_id(rule->nid, devid, rule->action); + // Delete sk rule + del_sk_rule(rule); + } + } + mutex_unlock(&vecls_sk_rules.mutex); + kfree(ctx_p); + atomic_dec(&vecls_worker_count); +} + +static bool has_sock_rule(struct sock *sk) +{ + struct vecls_netdev_info *vecls_dev; + struct vecls_sk_rule *rule; + int devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + rule = get_rule_from_sk(devid, sk); + if (rule) + return true; + } + return false; +} + +static void del_ntuple_rule(struct sock *sk) +{ + struct cfg_param *ctx_p; + + if (!has_sock_rule(sk)) + return; + + ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); + if (!ctx_p) + return; + get_sk_rule_addr(sk, ctx_p); + + ctx_p->is_del = true; + ctx_p->sk = sk; + INIT_WORK(&ctx_p->work, cfg_work); + queue_work(do_cfg_workqueue, &ctx_p->work); + atomic_inc(&vecls_worker_count); +} + +static void add_ntuple_rule(struct sock *sk) +{ + struct cfg_param *ctx_p; + int cpu = raw_smp_processor_id(); + int nid = cpu_to_node(cpu); + + if (check_appname(current->comm)) + return; + + ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); + if (!ctx_p) + return; + get_sk_rule_addr(sk, ctx_p); + + ctx_p->is_del = false; + ctx_p->sk = sk; + ctx_p->nid = nid; + ctx_p->cpu = cpu; + INIT_WORK(&ctx_p->work, cfg_work); + queue_work(do_cfg_workqueue, &ctx_p->work); + atomic_inc(&vecls_worker_count); +} + +static void ethtool_cfg_rxcls(struct sock *sk, int is_del) +{ + bool 
is_ipv6; + + if (sk->sk_state != TCP_LISTEN) + return; + + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return; + + is_ipv6 = !!(sk->sk_family == AF_INET6); + if (is_ipv6) + vecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, IPv6:%pI6, port:%d\n", + raw_smp_processor_id(), current->comm, sk, is_del, + &sk->sk_v6_rcv_saddr, (u16)sk->sk_num); + else + vecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, IPv4:%pI4, port:%d\n", + raw_smp_processor_id(), current->comm, sk, is_del, + &sk->sk_rcv_saddr, (u16)sk->sk_num); + + if (is_del) + del_ntuple_rule(sk); + else + add_ntuple_rule(sk); +} + +static void clean_vecls_sk_rules(void) +{ + struct vecls_netdev_info *vecls_dev; + struct cmd_context ctx = { 0 }; + struct vecls_sk_rule *rule; + struct hlist_head *hlist; + struct hlist_node *n; + unsigned int i; + int err; + + mutex_lock(&vecls_sk_rules.mutex); + for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) { + hlist = &vecls_sk_rules.hash[i]; + + hlist_for_each_entry_safe(rule, n, hlist, node) { + vecls_dev = get_vecls_netdev_info(rule->devid); + if (!vecls_dev) + continue; + strscpy(ctx.netdev, vecls_dev->dev_name, IFNAMSIZ); + ctx.del_ruleid = rule->ruleid; + err = cfg_ethtool_rule(&ctx, true); + vecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, err:%d\n", rule->sk, + rule->devid, rule->action, rule->ruleid, err); + + hlist_del(&rule->node); + vecls_debug("clean rule=%p\n", rule); + kfree(rule); + } + } + mutex_unlock(&vecls_sk_rules.mutex); +} + +int venetcls_ntuple_status(struct seq_file *seq, void *v) +{ + struct vecls_netdev_info *vecls_dev; + struct vecls_sk_rule *rule; + struct hlist_head *hlist; + struct hlist_node *n; + unsigned int i; + + seq_printf(seq, "%-16s %-42s %-8s %-6s %-6s %-6s\n", + "Interface", "dstIP", "dstPort", "rxq", "ruleId", "NumaID"); + mutex_lock(&vecls_sk_rules.mutex); + for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) { + hlist = &vecls_sk_rules.hash[i]; + hlist_for_each_entry_safe(rule, n, hlist, node) { + vecls_dev = 
get_vecls_netdev_info(rule->devid); + if (!vecls_dev) + continue; + if (rule->is_ipv6) + seq_printf(seq, "%-16s %-42pI6 %-8d %-6d %-6d %-6d\n", + vecls_dev->dev_name, &rule->dip6, ntohs(rule->dport), + rule->action, rule->ruleid, rule->nid); + else + seq_printf(seq, "%-16s %-42pI4 %-8d %-6d %-6d %-6d\n", + vecls_dev->dev_name, &rule->dip4, ntohs(rule->dport), + rule->action, rule->ruleid, rule->nid); + } + } + mutex_unlock(&vecls_sk_rules.mutex); + + return 0; +} + +static const struct vecls_hook_ops vecls_ntuple_ops = { + .vecls_flow_update = NULL, + .vecls_set_cpu = NULL, + .vecls_timeout = NULL, + .vecls_cfg_rxcls = ethtool_cfg_rxcls, +}; + +int vecls_ntuple_res_init(void) +{ + do_cfg_workqueue = alloc_ordered_workqueue("vecls_cfg", 0); + if (!do_cfg_workqueue) { + vecls_debug("alloc_ordered_workqueue fails\n"); + return -ENOMEM; + } + + init_vecls_sk_rules(); + RCU_INIT_POINTER(vecls_ops, &vecls_ntuple_ops); + synchronize_rcu(); + return 0; +} + +void vecls_ntuple_res_clean(void) +{ + RCU_INIT_POINTER(vecls_ops, NULL); + synchronize_rcu(); + + while (atomic_read(&vecls_worker_count) != 0) + mdelay(1); + destroy_workqueue(do_cfg_workqueue); + clean_vecls_sk_rules(); +} -- 2.34.1
hulk inclusion category: feature Link: https://gitee.com/openeuler/kernel/issues/ICBFCS CVE: NA -------------------------------- Make the dev and sock flow table sizes configurable through the dft_num and sft_num module parameters (defaults unchanged: 0x1000 and 0x100000 entries) so that larger tables can be used to reduce hash collisions. Also, when a dev flow is already valid and its CPU is on the same NUMA node as the new target CPU, refresh its timeout and return early. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- net/venetcls/venetcls.h | 4 ++-- net/venetcls/venetcls_flow.c | 13 ++++++++----- net/venetcls/venetcls_main.c | 8 ++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/net/venetcls/venetcls.h b/net/venetcls/venetcls.h index 14f02cd962c3..957645e28acf 100644 --- a/net/venetcls/venetcls.h +++ b/net/venetcls/venetcls.h @@ -107,8 +107,6 @@ struct vecls_sock_flow_table { u32 ents[] ____cacheline_aligned_in_smp; }; -#define VECLS_DEV_FLOW_TABLE_NUM 0x1000 -#define VECLS_SOCK_FLOW_TABLE_NUM 0x100000 #define VECLS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct vecls_dev_flow_table) + \ ((_num) * sizeof(struct vecls_dev_flow))) #define VECLS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct vecls_sock_flow_table, ents[_num])) @@ -139,6 +137,8 @@ extern int match_ip_flag; extern int debug; extern int vecls_netdev_num; extern int vecls_numa_num; +extern unsigned int dft_num; +extern unsigned int sft_num; #define vecls_debug(fmt, ...) 
\ do { \ diff --git a/net/venetcls/venetcls_flow.c b/net/venetcls/venetcls_flow.c index 242254b9bfe0..4ca2191d0718 100644 --- a/net/venetcls/venetcls_flow.c +++ b/net/venetcls/venetcls_flow.c @@ -237,6 +237,11 @@ static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, else newcpu = last_recv_cpu; + if (rflow->isvalid && cpu_to_node(rflow->cpu) == cpu_to_node(newcpu)) { + rflow->timeout = jiffies; + return; + } + if (cpu_to_node(cpu) == cpu_to_node(newcpu)) return; @@ -329,9 +334,8 @@ static int vecls_dev_flow_table_release(void) static int _vecls_dev_flow_table_init(struct net_device *netdev) { struct vecls_dev_flow_table *table; - int size = VECLS_DEV_FLOW_TABLE_NUM; + int size = dft_num, i, j, ret = 0; struct netdev_rx_queue *queue; - int i, j, ret = 0; size = roundup_pow_of_two(size); vecls_debug("%s dev:%s num_rx_queues:%d mask:0x%x\n", @@ -440,7 +444,7 @@ int venetcls_flow_status(struct seq_file *seq, void *v) lockdep_is_held(&vecls_dev_flow_lock)); if (!dtb) continue; - for (j = 0; j < VECLS_DEV_FLOW_TABLE_NUM; j++) { + for (j = 0; j < dft_num; j++) { if (dtb->flows[j].cpu == VECLS_NO_CPU) continue; if (dtb->flows[j].isvalid == 0) @@ -463,8 +467,7 @@ int venetcls_flow_status(struct seq_file *seq, void *v) static int vecls_sock_flow_table_init(void) { struct vecls_sock_flow_table *table; - int size = VECLS_SOCK_FLOW_TABLE_NUM; - int i; + int size = sft_num, i; size = roundup_pow_of_two(size); table = vmalloc(VECLS_SOCK_FLOW_TABLE_SIZE(size)); diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c index 856ff3b4427e..30f7a2bd4570 100644 --- a/net/venetcls/venetcls_main.c +++ b/net/venetcls/venetcls_main.c @@ -49,6 +49,14 @@ static char irqname[64] = "comp"; module_param_string(irqname, irqname, sizeof(irqname), 0644); MODULE_PARM_DESC(irqname, "nic irq name string, default comp"); +unsigned int dft_num = 0x1000; +module_param(dft_num, uint, 0444); +MODULE_PARM_DESC(dft_num, "dev flow table entries, default 0x1000"); + 
+unsigned int sft_num = 0x100000; +module_param(sft_num, uint, 0444); +MODULE_PARM_DESC(sft_num, "sock flow table entries, default 0x100000"); + static bool check_params(void) { if (mode != 0 && mode != 1) -- 2.34.1
hulk inclusion category: feature Link: https://gitee.com/openeuler/kernel/issues/ICBFCS CVE: NA -------------------------------- Make VENETCLS depend on the arm64 architecture and build it as a module by default on the Kunpeng platform. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- net/venetcls/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/venetcls/Kconfig b/net/venetcls/Kconfig index 7f2ea5c4a6b0..7ba9b35f1623 100644 --- a/net/venetcls/Kconfig +++ b/net/venetcls/Kconfig @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only config VENETCLS tristate "Network classification" - depends on MODULES - default n + depends on MODULES && ARM64 + default m help This introduces a kind of network optimization method, which can configure the flow steer rules, and bind interrupt to the netdev -- 2.34.1
hulk inclusion category: feature Link: https://gitee.com/openeuler/kernel/issues/ICBFCS CVE: NA -------------------------------- Use NUMA-aware flow tables for local flows to achieve better cache effectiveness and NUMA affinity. Also cache check_appname results in sk to avoid unnecessary dup check. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- include/linux/skbuff.h | 3 ++ include/linux/venetcls.h | 40 ++++++++++++-- include/net/sock.h | 3 ++ net/core/dev.c | 26 +++++---- net/core/sock.c | 3 ++ net/ipv4/tcp.c | 5 +- net/venetcls/venetcls.h | 8 +++ net/venetcls/venetcls_flow.c | 98 ++++++++++++++++++++++++++++++---- net/venetcls/venetcls_main.c | 31 ++++++++--- net/venetcls/venetcls_ntuple.c | 3 +- 10 files changed, 189 insertions(+), 31 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fb589a653837..d781ee4b8a48 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -958,6 +958,9 @@ struct sk_buff { __u32 priority; int skb_iif; __u32 hash; +#if IS_ENABLED(CONFIG_VENETCLS) + __u32 sym_hash; +#endif __be16 vlan_proto; __u16 vlan_tci; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) diff --git a/include/linux/venetcls.h b/include/linux/venetcls.h index 9cfcdd4e5766..fab7e57fde89 100644 --- a/include/linux/venetcls.h +++ b/include/linux/venetcls.h @@ -2,16 +2,20 @@ #ifndef _LINUX_VENETCLS_H #define _LINUX_VENETCLS_H +#include <linux/if_arp.h> + struct vecls_hook_ops { void (*vecls_cfg_rxcls)(struct sock *sk, int is_del); - void (*vecls_flow_update)(struct sock *sk); + void (*vecls_flow_update)(struct sock *sk, struct sk_buff *skb); void (*vecls_set_cpu)(struct sk_buff *skb, int *cpu, int *last_qtail); + void (*vecls_set_localcpu)(struct sk_buff *skb, int *cpu, int *last_qtail); bool (*vecls_timeout)(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); }; typedef int (*enqueue_f)(struct sk_buff *skb, int cpu, unsigned int *qtail); extern const struct vecls_hook_ops __rcu *vecls_ops; +extern 
struct static_key_false vecls_localrps_needed; static inline void venetcls_cfg_rxcls(struct sock *sk, int is_del) { @@ -24,14 +28,14 @@ static inline void venetcls_cfg_rxcls(struct sock *sk, int is_del) rcu_read_unlock(); } -static inline void venetcls_flow_update(struct sock *sk) +static inline void venetcls_flow_update(struct sock *sk, struct sk_buff *skb) { const struct vecls_hook_ops *ops; rcu_read_lock(); ops = rcu_dereference(vecls_ops); if (ops && ops->vecls_flow_update) - ops->vecls_flow_update(sk); + ops->vecls_flow_update(sk, skb); rcu_read_unlock(); } @@ -44,10 +48,18 @@ venetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) rcu_read_lock(); ops = rcu_dereference(vecls_ops); - if (ops && ops->vecls_set_cpu) { + if (ops) { cpu = -1; last_qtail = 0; - ops->vecls_set_cpu(skb, &cpu, &last_qtail); + /* mode 1 always use vecls_set_cpu hook for physical NIC or lo. + * mode 0 set this hook to NULL, to avoid unneeded ops in + * venetcls_skblist_set_cpu() for physical NIC flows, and use + * vecls_set_localcpu hook for loopback flows. 
+ */ + if (ops->vecls_set_cpu) + ops->vecls_set_cpu(skb, &cpu, &last_qtail); + else if (ops->vecls_set_localcpu) + ops->vecls_set_localcpu(skb, &cpu, &last_qtail); if (cpu >= 0) { *ret = enq_func(skb, cpu, &last_qtail); result = true; @@ -57,6 +69,24 @@ venetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) return result; } +static inline bool +venetcls_skb_set_localcpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) +{ + struct net_device *dev = skb->dev; + bool result = false; + + if (!static_branch_unlikely(&vecls_localrps_needed)) + return result; + if (!dev || !(dev->type == ARPHRD_LOOPBACK && dev->flags & IFF_LOOPBACK)) + return result; + + preempt_disable(); + if (venetcls_skb_set_cpu(skb, enq_func, ret)) + result = true; + preempt_enable(); + return result; +} + static inline void venetcls_skblist_set_cpu(struct list_head *head, enqueue_f enq_func) { diff --git a/include/net/sock.h b/include/net/sock.h index 7fca7acb7d30..161e3e53ff72 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -537,6 +537,9 @@ struct sock { #endif struct rcu_head sk_rcu; struct xarray sk_pagepool; +#if IS_ENABLED(CONFIG_VENETCLS) + u8 vecls_cmd_matched; +#endif }; enum sk_pacing { diff --git a/net/core/dev.c b/net/core/dev.c index 47b916ca8d46..b62fcd0a6daf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -164,6 +164,8 @@ #include <linux/venetcls.h> const struct vecls_hook_ops __rcu *vecls_ops __read_mostly; EXPORT_SYMBOL_GPL(vecls_ops); +struct static_key_false vecls_localrps_needed __read_mostly; +EXPORT_SYMBOL(vecls_localrps_needed); #endif static DEFINE_SPINLOCK(ptype_lock); @@ -5199,6 +5201,12 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); +#if IS_ENABLED(CONFIG_VENETCLS) + if (static_branch_unlikely(&vecls_localrps_needed)) { + if (venetcls_skb_set_localcpu(skb, enqueue_to_backlog, &ret)) + return ret; + } +#endif #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = 
&voidflow; @@ -5880,6 +5888,12 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return NET_RX_SUCCESS; rcu_read_lock(); +#if IS_ENABLED(CONFIG_VENETCLS) + if (venetcls_skb_set_cpu(skb, enqueue_to_backlog, &ret)) { + rcu_read_unlock(); + return ret; + } +#endif #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -5891,12 +5905,6 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } } -#endif -#if IS_ENABLED(CONFIG_VENETCLS) - if (venetcls_skb_set_cpu(skb, enqueue_to_backlog, &ret)) { - rcu_read_unlock(); - return ret; - } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); @@ -5918,6 +5926,9 @@ static void netif_receive_skb_list_internal(struct list_head *head) list_splice_init(&sublist, head); rcu_read_lock(); +#if IS_ENABLED(CONFIG_VENETCLS) + venetcls_skblist_set_cpu(head, enqueue_to_backlog); +#endif #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { @@ -5931,9 +5942,6 @@ static void netif_receive_skb_list_internal(struct list_head *head) } } } -#endif -#if IS_ENABLED(CONFIG_VENETCLS) - venetcls_skblist_set_cpu(head, enqueue_to_backlog); #endif __netif_receive_skb_list(head); rcu_read_unlock(); diff --git a/net/core/sock.c b/net/core/sock.c index d104194eef9a..748f0f4e1115 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2118,6 +2118,9 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); sk_tx_queue_clear(sk); +#if IS_ENABLED(CONFIG_VENETCLS) + sk->vecls_cmd_matched = 0; +#endif } return sk; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cc84873cee0d..48686eef0768 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2900,6 +2900,9 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, if (used + offset < skb->len) continue; +#if IS_ENABLED(CONFIG_VENETCLS) + 
venetcls_flow_update(sk, skb); +#endif if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) @@ -2944,7 +2947,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, return inet_recv_error(sk, msg, len, addr_len); #if IS_ENABLED(CONFIG_VENETCLS) - venetcls_flow_update(sk); + venetcls_flow_update(sk, NULL); #endif if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && diff --git a/net/venetcls/venetcls.h b/net/venetcls/venetcls.h index 957645e28acf..05fe2e3592f4 100644 --- a/net/venetcls/venetcls.h +++ b/net/venetcls/venetcls.h @@ -15,6 +15,10 @@ #define RXQ_MAX_USECNT 0xFF +#define VECLS_CMD_UNKNOWN 0 +#define VECLS_CMD_MATCHED 1 +#define VECLS_CMD_NO_MATCH 2 + struct vecls_netdev_queue_info { int irq; int affinity_cpu; @@ -133,6 +137,8 @@ struct cfg_param { int cpu; }; +extern int lo_numa_rps; +extern int mode; extern int match_ip_flag; extern int debug; extern int vecls_netdev_num; @@ -179,5 +185,7 @@ int venetcls_ntuple_status(struct seq_file *seq, void *v); int vecls_flow_res_init(void); void vecls_flow_res_clean(void); int venetcls_flow_status(struct seq_file *seq, void *v); +void _vecls_flow_update(struct sock *sk, struct sk_buff *skb); +void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail); #endif /* _NET_VENETCLS_H */ diff --git a/net/venetcls/venetcls_flow.c b/net/venetcls/venetcls_flow.c index 4ca2191d0718..85d1abe4c708 100644 --- a/net/venetcls/venetcls_flow.c +++ b/net/venetcls/venetcls_flow.c @@ -5,11 +5,17 @@ #include <linux/irq.h> #include <linux/irqdesc.h> #include <linux/inet.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <net/ipv6.h> +#include <net/inet_sock.h> #include <linux/venetcls.h> #include <net/sock.h> #include "venetcls.h" +static u16 *rps_cpus; +static int rps_cpus_nums; static u32 vecls_cpu_mask; static struct vecls_sock_flow_table __rcu *vecls_sock_flow_table; static DEFINE_MUTEX(vecls_sock_flow_mutex); @@ -61,22 +67,47 @@ 
static bool _vecls_timeout(struct net_device *dev, u16 rxq_index, return expire; } -static void _vecls_flow_update(struct sock *sk) +static inline bool sk_is_loopback(struct sock *sk) +{ + if (sk->sk_family == AF_INET) { + if (ipv4_is_loopback(sk->sk_daddr) && ipv4_is_loopback(sk->sk_rcv_saddr)) + return true; + } + + if (sk->sk_family == AF_INET6) { + if (ipv6_addr_loopback(&sk->sk_v6_daddr) && + ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + return true; + } + return false; +} + +void _vecls_flow_update(struct sock *sk, struct sk_buff *skb) { struct vecls_sock_flow_table *tb; unsigned int hash, index; - u32 val; - u32 cpu = raw_smp_processor_id(); + u32 val, cpu; if (sk->sk_state != TCP_ESTABLISHED) return; - if (check_appname(current->comm)) + if (unlikely(sk->vecls_cmd_matched == VECLS_CMD_UNKNOWN)) { + if (check_appname(current->comm)) { + sk->vecls_cmd_matched = VECLS_CMD_NO_MATCH; + return; + } + sk->vecls_cmd_matched = VECLS_CMD_MATCHED; + } + if (sk->vecls_cmd_matched != VECLS_CMD_MATCHED) return; + cpu = raw_smp_processor_id(); rcu_read_lock(); tb = rcu_dereference(vecls_sock_flow_table); - hash = READ_ONCE(sk->sk_rxhash); + if (lo_numa_rps && skb && sk_is_loopback(sk)) + hash = READ_ONCE(skb->sym_hash); + else + hash = READ_ONCE(sk->sk_rxhash); if (tb && hash) { index = hash & tb->mask; val = hash & ~vecls_cpu_mask; @@ -185,7 +216,7 @@ static void set_vecls_cpu(struct net_device *dev, struct sk_buff *skb, rflow->cpu = next_cpu; } -static int get_cpu_in_numa(int tcpu, u32 hash) +static inline int get_cpu_in_numa(int tcpu, u32 hash) { const struct cpumask *mask; int nr_cpus, cpu, index; @@ -249,7 +280,40 @@ static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, set_vecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu); } -static void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) +static inline void loopback_numa_rps(struct sk_buff *skb, int *rcpu) +{ + struct vecls_sock_flow_table *stb; + u32 last_recv_cpu, hash, val; + int 
newcpu, index; + + skb_reset_network_header(skb); + hash = __skb_get_hash_symmetric(skb); + if (!hash) + return; + + WRITE_ONCE(skb->sym_hash, hash); + rcu_read_lock(); + stb = rcu_dereference(vecls_sock_flow_table); + if (stb) { + val = READ_ONCE(stb->ents[hash & stb->mask]); + last_recv_cpu = val & vecls_cpu_mask; + } else { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + if ((val ^ hash) & ~vecls_cpu_mask) + return; + + newcpu = cpumask_first(cpumask_of_node(cpu_to_node(last_recv_cpu))); + index = rps_cpus[reciprocal_scale(hash, rps_cpus_nums - 1)]; + newcpu += index; + *rcpu = newcpu; + vecls_debug("last:%u curcpu:%d newcpu:%d\n", last_recv_cpu, raw_smp_processor_id(), newcpu); +} + +void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) { struct net_device *ndev = skb->dev; struct vecls_sock_flow_table *stb; @@ -262,6 +326,9 @@ static void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) if (!ndev) return; + if (lo_numa_rps && (ndev->type == ARPHRD_LOOPBACK && ndev->flags & IFF_LOOPBACK)) + loopback_numa_rps(skb, cpu); + if (!is_vecls_config_netdev(ndev->name)) return; @@ -399,6 +466,7 @@ static int vecls_dev_flow_table_init(void) static const struct vecls_hook_ops vecls_flow_ops = { .vecls_flow_update = _vecls_flow_update, .vecls_set_cpu = _vecls_set_cpu, + .vecls_set_localcpu = NULL, .vecls_timeout = _vecls_timeout, .vecls_cfg_rxcls = NULL, }; @@ -415,6 +483,7 @@ static int vecls_sock_flow_table_release(void) mutex_unlock(&vecls_sock_flow_mutex); synchronize_rcu(); vfree(tb); + kfree(rps_cpus); return 0; } @@ -469,10 +538,20 @@ static int vecls_sock_flow_table_init(void) struct vecls_sock_flow_table *table; int size = sft_num, i; + rps_cpus_nums = cpumask_weight(cpumask_of_node(0)); + rps_cpus = kmalloc_array(rps_cpus_nums, sizeof(u16), GFP_KERNEL); + if (!rps_cpus) + return -ENOMEM; + for (i = 0; i < rps_cpus_nums; i++) + rps_cpus[i] = i; + vecls_debug("rps_cpus_nums:%d\n", rps_cpus_nums); + size = 
roundup_pow_of_two(size); table = vmalloc(VECLS_SOCK_FLOW_TABLE_SIZE(size)); - if (!table) + if (!table) { + kfree(rps_cpus); return -ENOMEM; + } vecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; vecls_debug("nr_cpu_ids:%d, vecls_cpu_mask:0x%x\n", nr_cpu_ids, vecls_cpu_mask); @@ -499,7 +578,8 @@ int vecls_flow_res_init(void) if (err) goto clean; - RCU_INIT_POINTER(vecls_ops, &vecls_flow_ops); + if (mode != 0) //for lo rps + RCU_INIT_POINTER(vecls_ops, &vecls_flow_ops); synchronize_rcu(); return 0; diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c index 30f7a2bd4570..e1c74b4b669c 100644 --- a/net/venetcls/venetcls_main.c +++ b/net/venetcls/venetcls_main.c @@ -8,6 +8,7 @@ #include <linux/proc_fs.h> #include <linux/rtnetlink.h> #include <linux/seq_file.h> +#include <linux/venetcls.h> #include "venetcls.h" int vecls_netdev_num; @@ -17,11 +18,15 @@ int vecls_numa_num; static int vecls_cluster_cpu_num, vecls_cluster_per_numa; static struct vecls_numa_info *vecls_numa_info_table; +int lo_numa_rps; +module_param(lo_numa_rps, int, 0644); +MODULE_PARM_DESC(lo_numa_rps, "enable loopback flow numa affinity"); + int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "debug switch"); -static int mode; +int mode; module_param(mode, int, 0444); MODULE_PARM_DESC(mode, "mode, default 0"); @@ -516,7 +521,8 @@ static int init_single_vecls_dev(char *if_name, unsigned int length) ret = vecls_filter_enable(dev_name, &old_state); if (ret) { vecls_error("dev [%s] not support ntuple! 
ret=%d\n", dev_name, ret); - goto out; + if (lo_numa_rps) + goto out; } vecls_dev = alloc_vecls_netdev_info(); @@ -1111,10 +1117,15 @@ static __init int vecls_init(void) set_netdev_xps_queue(true); #endif - if (mode == 0) + if (mode == 0) { err = vecls_ntuple_res_init(); - else + if (err) + goto clean_rxq; + if (lo_numa_rps) + err = vecls_flow_res_init(); + } else { err = vecls_flow_res_init(); + } if (err) goto clean_rxq; @@ -1126,6 +1137,8 @@ static __init int vecls_init(void) goto clean_rxq; } #endif + if (lo_numa_rps) + static_branch_inc(&vecls_localrps_needed); return 0; @@ -1138,13 +1151,19 @@ static __init int vecls_init(void) static __exit void vecls_exit(void) { + if (lo_numa_rps) + static_branch_dec(&vecls_localrps_needed); + #ifdef CONFIG_PROC_FS remove_proc_entry("venet_status", init_net.proc_net); #endif - if (mode == 0) + if (mode == 0) { vecls_ntuple_res_clean(); - else + if (lo_numa_rps) + vecls_flow_res_clean(); + } else { vecls_flow_res_clean(); + } #ifdef CONFIG_XPS set_netdev_xps_queue(false); diff --git a/net/venetcls/venetcls_ntuple.c b/net/venetcls/venetcls_ntuple.c index ad3c10f8ae5f..ac73d548fdee 100644 --- a/net/venetcls/venetcls_ntuple.c +++ b/net/venetcls/venetcls_ntuple.c @@ -681,7 +681,8 @@ int venetcls_ntuple_status(struct seq_file *seq, void *v) } static const struct vecls_hook_ops vecls_ntuple_ops = { - .vecls_flow_update = NULL, + .vecls_flow_update = _vecls_flow_update, + .vecls_set_localcpu = _vecls_set_cpu, .vecls_set_cpu = NULL, .vecls_timeout = NULL, .vecls_cfg_rxcls = ethtool_cfg_rxcls, -- 2.34.1
hulk inclusion category: feature Link: https://gitee.com/openeuler/kernel/issues/ICBFCS CVE: NA -------------------------------- Support rps affinity policy setting (0 as no rps, 1 as numa, 2 as cluster). Also use an rps cpus map instead of cpumask traversal to improve performance. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- include/linux/venetcls.h | 6 +- net/venetcls/venetcls.h | 3 +- net/venetcls/venetcls_flow.c | 121 +++++++++++++++++++++------------ net/venetcls/venetcls_main.c | 22 +++--- net/venetcls/venetcls_ntuple.c | 4 +- 5 files changed, 100 insertions(+), 56 deletions(-) diff --git a/include/linux/venetcls.h b/include/linux/venetcls.h index fab7e57fde89..acbffdb91ee8 100644 --- a/include/linux/venetcls.h +++ b/include/linux/venetcls.h @@ -52,9 +52,9 @@ venetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) cpu = -1; last_qtail = 0; /* mode 1 always use vecls_set_cpu hook for physical NIC or lo. - * mode 0 set this hook to NULL, to avoid unneeded ops in - * venetcls_skblist_set_cpu() for physical NIC flows, and use - * vecls_set_localcpu hook for loopback flows. + * mode 0 set this hook to NULL if rps_policy is 0 , to avoid + * unneeded ops in venetcls_skblist_set_cpu() for physical NIC + * flows, and use vecls_set_localcpu hook for loopback flows. 
*/ if (ops->vecls_set_cpu) ops->vecls_set_cpu(skb, &cpu, &last_qtail); diff --git a/net/venetcls/venetcls.h b/net/venetcls/venetcls.h index 05fe2e3592f4..4313939e91d9 100644 --- a/net/venetcls/venetcls.h +++ b/net/venetcls/venetcls.h @@ -137,7 +137,8 @@ struct cfg_param { int cpu; }; -extern int lo_numa_rps; +extern int rps_policy; +extern int lo_rps_policy; extern int mode; extern int match_ip_flag; extern int debug; diff --git a/net/venetcls/venetcls_flow.c b/net/venetcls/venetcls_flow.c index 85d1abe4c708..758067a7c6f1 100644 --- a/net/venetcls/venetcls_flow.c +++ b/net/venetcls/venetcls_flow.c @@ -14,9 +14,9 @@ #include "venetcls.h" -static u16 *rps_cpus; -static int rps_cpus_nums; static u32 vecls_cpu_mask; +static u16 *rps_cpus, *cluster_rps_cpus; +static int rps_cpus_nums, cluster_rps_cpus_nums; static struct vecls_sock_flow_table __rcu *vecls_sock_flow_table; static DEFINE_MUTEX(vecls_sock_flow_mutex); static DEFINE_SPINLOCK(vecls_dev_flow_lock); @@ -70,13 +70,16 @@ static bool _vecls_timeout(struct net_device *dev, u16 rxq_index, static inline bool sk_is_loopback(struct sock *sk) { if (sk->sk_family == AF_INET) { - if (ipv4_is_loopback(sk->sk_daddr) && ipv4_is_loopback(sk->sk_rcv_saddr)) + if (ipv4_is_loopback(sk->sk_daddr) || ipv4_is_loopback(sk->sk_rcv_saddr)) return true; } if (sk->sk_family == AF_INET6) { - if (ipv6_addr_loopback(&sk->sk_v6_daddr) && - ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + if (ipv6_addr_loopback(&sk->sk_v6_daddr) || + ipv6_addr_loopback(&sk->sk_v6_rcv_saddr) || + ipv6_addr_v4mapped_loopback(&sk->sk_v6_daddr) || + ipv6_addr_v4mapped_loopback(&sk->sk_v6_rcv_saddr) || + ipv6_addr_equal(&sk->sk_v6_daddr, &sk->sk_v6_rcv_saddr)) return true; } return false; @@ -104,7 +107,7 @@ void _vecls_flow_update(struct sock *sk, struct sk_buff *skb) cpu = raw_smp_processor_id(); rcu_read_lock(); tb = rcu_dereference(vecls_sock_flow_table); - if (lo_numa_rps && skb && sk_is_loopback(sk)) + if (lo_rps_policy && skb && sk_is_loopback(sk)) hash = 
READ_ONCE(skb->sym_hash); else hash = READ_ONCE(sk->sk_rxhash); @@ -216,37 +219,31 @@ static void set_vecls_cpu(struct net_device *dev, struct sk_buff *skb, rflow->cpu = next_cpu; } -static inline int get_cpu_in_numa(int tcpu, u32 hash) +static inline u32 get_rps_cpu(u32 last_recv_cpu, u32 hash, int policy) { - const struct cpumask *mask; - int nr_cpus, cpu, index; - - mask = cpumask_of_node(cpu_to_node(tcpu)); - nr_cpus = cpumask_weight(mask); - if (nr_cpus == 0) - return -1; - - index = reciprocal_scale(hash, nr_cpus); - if (index < 0) - return -1; - - cpu = cpumask_first(mask); - while (--nr_cpus > 0) { - if (index == 0) - break; - cpu = cpumask_next(cpu, mask); - index--; + u32 newcpu, index; + + if (policy == 1) { + newcpu = cpumask_first(cpumask_of_node(cpu_to_node(last_recv_cpu))); + index = rps_cpus[reciprocal_scale(hash, rps_cpus_nums - 1)]; + newcpu += index; + } else if (policy == 2) { + newcpu = cpumask_first(topology_cluster_cpumask(last_recv_cpu)); + index = cluster_rps_cpus[reciprocal_scale(hash, cluster_rps_cpus_nums - 1)]; + newcpu += index; + } else { + newcpu = last_recv_cpu; } - return cpu; + + return newcpu; } static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, struct vecls_sock_flow_table *tb, struct vecls_dev_flow_table *dtb, int old_rxq_id, int *rcpu, int *last_qtail) { - u32 last_recv_cpu, hash, val, cpu, tcpu; + u32 last_recv_cpu, hash, val, cpu, tcpu, newcpu; struct vecls_dev_flow *rflow; - int newcpu; cpu = raw_smp_processor_id(); skb_reset_network_header(skb); @@ -262,11 +259,7 @@ static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, if ((val ^ hash) & ~vecls_cpu_mask) return; - newcpu = get_cpu_in_numa(last_recv_cpu, hash); - if (newcpu >= 0) - *rcpu = newcpu; - else - newcpu = last_recv_cpu; + newcpu = get_rps_cpu(last_recv_cpu, hash, rps_policy); if (rflow->isvalid && cpu_to_node(rflow->cpu) == cpu_to_node(newcpu)) { rflow->timeout = jiffies; @@ -280,11 +273,10 @@ static void 
__vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, set_vecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu); } -static inline void loopback_numa_rps(struct sk_buff *skb, int *rcpu) +static inline void do_loopback_rps(struct sk_buff *skb, int *rcpu) { + u32 last_recv_cpu, hash, val, newcpu; struct vecls_sock_flow_table *stb; - u32 last_recv_cpu, hash, val; - int newcpu, index; skb_reset_network_header(skb); hash = __skb_get_hash_symmetric(skb); @@ -306,9 +298,36 @@ static inline void loopback_numa_rps(struct sk_buff *skb, int *rcpu) if ((val ^ hash) & ~vecls_cpu_mask) return; - newcpu = cpumask_first(cpumask_of_node(cpu_to_node(last_recv_cpu))); - index = rps_cpus[reciprocal_scale(hash, rps_cpus_nums - 1)]; - newcpu += index; + newcpu = get_rps_cpu(last_recv_cpu, hash, lo_rps_policy); + *rcpu = newcpu; + vecls_debug("last:%u curcpu:%d newcpu:%d\n", last_recv_cpu, raw_smp_processor_id(), newcpu); +} + +static inline void do_flow_rps(struct sk_buff *skb, int *rcpu) +{ + u32 last_recv_cpu, hash, val, newcpu; + struct vecls_sock_flow_table *stb; + + skb_reset_network_header(skb); + hash = skb_get_hash(skb); + if (!hash) + return; + + rcu_read_lock(); + stb = rcu_dereference(vecls_sock_flow_table); + if (stb) { + val = READ_ONCE(stb->ents[hash & stb->mask]); + last_recv_cpu = val & vecls_cpu_mask; + } else { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + if ((val ^ hash) & ~vecls_cpu_mask) + return; + + newcpu = get_rps_cpu(last_recv_cpu, hash, rps_policy); *rcpu = newcpu; vecls_debug("last:%u curcpu:%d newcpu:%d\n", last_recv_cpu, raw_smp_processor_id(), newcpu); } @@ -326,12 +345,17 @@ void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) if (!ndev) return; - if (lo_numa_rps && (ndev->type == ARPHRD_LOOPBACK && ndev->flags & IFF_LOOPBACK)) - loopback_numa_rps(skb, cpu); + if (lo_rps_policy && (ndev->type == ARPHRD_LOOPBACK && ndev->flags & IFF_LOOPBACK)) + do_loopback_rps(skb, cpu); if (!is_vecls_config_netdev(ndev->name)) return; + if 
(rps_policy && mode == 0) { + do_flow_rps(skb, cpu); + return; + } + rxqueue = ndev->_rx; if (skb_rx_queue_recorded(skb)) { rxq_id = skb_get_rx_queue(skb); @@ -484,6 +508,7 @@ static int vecls_sock_flow_table_release(void) synchronize_rcu(); vfree(tb); kfree(rps_cpus); + kfree(cluster_rps_cpus); return 0; } @@ -538,18 +563,30 @@ static int vecls_sock_flow_table_init(void) struct vecls_sock_flow_table *table; int size = sft_num, i; + cluster_rps_cpus_nums = cpumask_weight(topology_cluster_cpumask(0)); rps_cpus_nums = cpumask_weight(cpumask_of_node(0)); rps_cpus = kmalloc_array(rps_cpus_nums, sizeof(u16), GFP_KERNEL); if (!rps_cpus) return -ENOMEM; for (i = 0; i < rps_cpus_nums; i++) rps_cpus[i] = i; - vecls_debug("rps_cpus_nums:%d\n", rps_cpus_nums); + + cluster_rps_cpus = kmalloc_array(cluster_rps_cpus_nums, sizeof(u16), GFP_KERNEL); + if (!cluster_rps_cpus) { + kfree(rps_cpus); + return -ENOMEM; + } + for (i = 0; i < cluster_rps_cpus_nums; i++) + cluster_rps_cpus[i] = i; + + vecls_debug("rps_cpus_nums:%d cluster_rps_cpus_nums:%d\n", + rps_cpus_nums, cluster_rps_cpus_nums); size = roundup_pow_of_two(size); table = vmalloc(VECLS_SOCK_FLOW_TABLE_SIZE(size)); if (!table) { kfree(rps_cpus); + kfree(cluster_rps_cpus); return -ENOMEM; } diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c index e1c74b4b669c..00ec0b0e2498 100644 --- a/net/venetcls/venetcls_main.c +++ b/net/venetcls/venetcls_main.c @@ -18,9 +18,13 @@ int vecls_numa_num; static int vecls_cluster_cpu_num, vecls_cluster_per_numa; static struct vecls_numa_info *vecls_numa_info_table; -int lo_numa_rps; -module_param(lo_numa_rps, int, 0644); -MODULE_PARM_DESC(lo_numa_rps, "enable loopback flow numa affinity"); +int rps_policy = 1; +module_param(rps_policy, int, 0644); +MODULE_PARM_DESC(rps_policy, "phy nic rps policy, default 1"); + +int lo_rps_policy; +module_param(lo_rps_policy, int, 0644); +MODULE_PARM_DESC(lo_rps_policy, "loopback rps policy, default 0"); int debug; module_param(debug, 
int, 0644); @@ -56,7 +60,7 @@ MODULE_PARM_DESC(irqname, "nic irq name string, default comp"); unsigned int dft_num = 0x1000; module_param(dft_num, uint, 0444); -MODULE_PARM_DESC(dft_num, "dev flow table entries, default 0x10000"); +MODULE_PARM_DESC(dft_num, "dev flow table entries, default 0x1000"); unsigned int sft_num = 0x100000; module_param(sft_num, uint, 0444); @@ -521,7 +525,7 @@ static int init_single_vecls_dev(char *if_name, unsigned int length) ret = vecls_filter_enable(dev_name, &old_state); if (ret) { vecls_error("dev [%s] not support ntuple! ret=%d\n", dev_name, ret); - if (lo_numa_rps) + if (lo_rps_policy) goto out; } @@ -1121,7 +1125,7 @@ static __init int vecls_init(void) err = vecls_ntuple_res_init(); if (err) goto clean_rxq; - if (lo_numa_rps) + if (lo_rps_policy) err = vecls_flow_res_init(); } else { err = vecls_flow_res_init(); @@ -1137,7 +1141,7 @@ static __init int vecls_init(void) goto clean_rxq; } #endif - if (lo_numa_rps) + if (lo_rps_policy) static_branch_inc(&vecls_localrps_needed); return 0; @@ -1151,7 +1155,7 @@ static __init int vecls_init(void) static __exit void vecls_exit(void) { - if (lo_numa_rps) + if (lo_rps_policy) static_branch_dec(&vecls_localrps_needed); #ifdef CONFIG_PROC_FS @@ -1159,7 +1163,7 @@ static __exit void vecls_exit(void) #endif if (mode == 0) { vecls_ntuple_res_clean(); - if (lo_numa_rps) + if (lo_rps_policy) vecls_flow_res_clean(); } else { vecls_flow_res_clean(); diff --git a/net/venetcls/venetcls_ntuple.c b/net/venetcls/venetcls_ntuple.c index ac73d548fdee..8fc5d8da06fe 100644 --- a/net/venetcls/venetcls_ntuple.c +++ b/net/venetcls/venetcls_ntuple.c @@ -680,7 +680,7 @@ int venetcls_ntuple_status(struct seq_file *seq, void *v) return 0; } -static const struct vecls_hook_ops vecls_ntuple_ops = { +static struct vecls_hook_ops vecls_ntuple_ops = { .vecls_flow_update = _vecls_flow_update, .vecls_set_localcpu = _vecls_set_cpu, .vecls_set_cpu = NULL, @@ -697,6 +697,8 @@ int vecls_ntuple_res_init(void) } 
init_vecls_sk_rules(); + if (rps_policy) + vecls_ntuple_ops.vecls_set_cpu = _vecls_set_cpu; RCU_INIT_POINTER(vecls_ops, &vecls_ntuple_ops); synchronize_rcu(); return 0; -- 2.34.1
hulk inclusion category: feature Link: https://gitee.com/openeuler/kernel/issues/ICBFCS CVE: NA -------------------------------- Initialize the flow table when rps_policy is enabled, and add the vecls_flow_enabled static key to optimize performance. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- include/linux/venetcls.h | 3 +-- net/core/dev.c | 14 ++++++++++---- net/venetcls/venetcls_flow.c | 22 +++++++++++++--------- net/venetcls/venetcls_main.c | 9 +++++---- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/include/linux/venetcls.h b/include/linux/venetcls.h index acbffdb91ee8..fdafe47e8f9f 100644 --- a/include/linux/venetcls.h +++ b/include/linux/venetcls.h @@ -16,6 +16,7 @@ struct vecls_hook_ops { typedef int (*enqueue_f)(struct sk_buff *skb, int cpu, unsigned int *qtail); extern const struct vecls_hook_ops __rcu *vecls_ops; extern struct static_key_false vecls_localrps_needed; +extern struct static_key_false vecls_flow_enabled; static inline void venetcls_cfg_rxcls(struct sock *sk, int is_del) { @@ -75,8 +76,6 @@ venetcls_skb_set_localcpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) struct net_device *dev = skb->dev; bool result = false; - if (!static_branch_unlikely(&vecls_localrps_needed)) - return result; if (!dev || !(dev->type == ARPHRD_LOOPBACK && dev->flags & IFF_LOOPBACK)) return result; diff --git a/net/core/dev.c b/net/core/dev.c index b62fcd0a6daf..10445e98c8a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -166,6 +166,8 @@ const struct vecls_hook_ops __rcu *vecls_ops __read_mostly; EXPORT_SYMBOL_GPL(vecls_ops); struct static_key_false vecls_localrps_needed __read_mostly; EXPORT_SYMBOL(vecls_localrps_needed); +struct static_key_false vecls_flow_enabled __read_mostly; +EXPORT_SYMBOL(vecls_flow_enabled); #endif static DEFINE_SPINLOCK(ptype_lock); @@ -5889,9 +5891,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); #if IS_ENABLED(CONFIG_VENETCLS) - if (venetcls_skb_set_cpu(skb, enqueue_to_backlog, 
&ret)) { - rcu_read_unlock(); - return ret; + if (static_branch_unlikely(&vecls_flow_enabled)) { + if (venetcls_skb_set_cpu(skb, enqueue_to_backlog, &ret)) { + rcu_read_unlock(); + return ret; + } } #endif #ifdef CONFIG_RPS @@ -5927,7 +5931,9 @@ static void netif_receive_skb_list_internal(struct list_head *head) rcu_read_lock(); #if IS_ENABLED(CONFIG_VENETCLS) - venetcls_skblist_set_cpu(head, enqueue_to_backlog); + if (static_branch_unlikely(&vecls_flow_enabled)) { + venetcls_skblist_set_cpu(head, enqueue_to_backlog); + } #endif #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { diff --git a/net/venetcls/venetcls_flow.c b/net/venetcls/venetcls_flow.c index 758067a7c6f1..9562dc9ae03c 100644 --- a/net/venetcls/venetcls_flow.c +++ b/net/venetcls/venetcls_flow.c @@ -122,13 +122,13 @@ void _vecls_flow_update(struct sock *sk, struct sk_buff *skb) rcu_read_unlock(); } -static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb) +static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb, u32 hash) { struct vecls_numa_bound_dev_info *bound_dev = NULL; struct vecls_netdev_info *vecls_dev; struct vecls_numa_info *numa_info; int i, devid, rxq_num, rxq_id; - u32 hash, index; + u32 index; numa_info = get_vecls_numa_info(nid); if (!numa_info) @@ -154,7 +154,6 @@ static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *s } if (rxq_num == 0) return -1; - hash = skb_get_hash(skb); index = hash % rxq_num; i = 0; @@ -170,19 +169,19 @@ static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *s } static void set_vecls_cpu(struct net_device *dev, struct sk_buff *skb, - struct vecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu) + struct vecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu, u32 hash) { struct netdev_rx_queue *rxqueue; struct vecls_dev_flow_table *dtb; struct vecls_dev_flow *rflow; - u32 flow_id, hash; int rxq_index, rc; + u32 flow_id; if 
(!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || !(dev->features & NETIF_F_NTUPLE)) return; - rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb); + rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb, hash); if (rxq_index == skb_get_rx_queue(skb) || rxq_index < 0) { vecls_debug("%s skb:%p, old_rxq:%d, next_cpu:%d new_rxq:%d\n", __func__, skb, old_rxq_id, next_cpu, rxq_index); @@ -194,7 +193,6 @@ static void set_vecls_cpu(struct net_device *dev, struct sk_buff *skb, if (!dtb) return; - hash = skb_get_hash(skb); flow_id = hash & dtb->mask; rflow = &dtb->flows[flow_id]; @@ -245,7 +243,6 @@ static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, u32 last_recv_cpu, hash, val, cpu, tcpu, newcpu; struct vecls_dev_flow *rflow; - cpu = raw_smp_processor_id(); skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) @@ -260,17 +257,22 @@ static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, return; newcpu = get_rps_cpu(last_recv_cpu, hash, rps_policy); + if (rps_policy) + *rcpu = newcpu; + vecls_debug("last:%u curcpu:%d newcpu:%d rcpu:%d\n", + last_recv_cpu, raw_smp_processor_id(), newcpu, *rcpu); if (rflow->isvalid && cpu_to_node(rflow->cpu) == cpu_to_node(newcpu)) { rflow->timeout = jiffies; return; } + cpu = raw_smp_processor_id(); if (cpu_to_node(cpu) == cpu_to_node(newcpu)) return; if (tcpu >= nr_cpu_ids) - set_vecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu); + set_vecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu, hash); } static inline void do_loopback_rps(struct sk_buff *skb, int *rcpu) @@ -618,6 +620,7 @@ int vecls_flow_res_init(void) if (mode != 0) //for lo rps RCU_INIT_POINTER(vecls_ops, &vecls_flow_ops); synchronize_rcu(); + static_branch_inc(&vecls_flow_enabled); return 0; clean: @@ -627,6 +630,7 @@ int vecls_flow_res_init(void) void vecls_flow_res_clean(void) { + static_branch_dec(&vecls_flow_enabled); RCU_INIT_POINTER(vecls_ops, NULL); synchronize_rcu(); vecls_sock_flow_table_release(); 
diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c index 00ec0b0e2498..2072e330941e 100644 --- a/net/venetcls/venetcls_main.c +++ b/net/venetcls/venetcls_main.c @@ -5,6 +5,7 @@ #include <linux/ethtool.h> #include <linux/irq.h> #include <linux/irqdesc.h> +#include <linux/minmax.h> #include <linux/proc_fs.h> #include <linux/rtnetlink.h> #include <linux/seq_file.h> @@ -495,7 +496,7 @@ static void vecls_filter_restore(const char *dev_name, bool old_state) } } -static int init_single_vecls_dev(char *if_name, unsigned int length) +static int init_single_vecls_dev(char *if_name, size_t length) { struct vecls_netdev_info *vecls_dev; char dev_name[IFNAMSIZ] = { 0 }; @@ -503,7 +504,7 @@ static int init_single_vecls_dev(char *if_name, unsigned int length) bool old_state = false; int ret; - strscpy(dev_name, if_name, IFNAMSIZ); + strscpy(dev_name, if_name, min_t(size_t, length + 1, IFNAMSIZ)); netdev = dev_get_by_name(&init_net, dev_name); if (!netdev) { vecls_error("dev [%s] is not exist!\n", dev_name); @@ -1125,7 +1126,7 @@ static __init int vecls_init(void) err = vecls_ntuple_res_init(); if (err) goto clean_rxq; - if (lo_rps_policy) + if (lo_rps_policy || rps_policy) err = vecls_flow_res_init(); } else { err = vecls_flow_res_init(); @@ -1163,7 +1164,7 @@ static __exit void vecls_exit(void) #endif if (mode == 0) { vecls_ntuple_res_clean(); - if (lo_rps_policy) + if (lo_rps_policy || rps_policy) vecls_flow_res_clean(); } else { vecls_flow_res_clean(); -- 2.34.1
hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9102 -------------------------------- Only use the NIC's current hardware channels (the ethtool combined channel count) for flow steering. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> Signed-off-by: Li Xiasong <lixiasong1@huawei.com> --- net/venetcls/venetcls_main.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c index 2072e330941e..fefdbd7d552a 100644 --- a/net/venetcls/venetcls_main.c +++ b/net/venetcls/venetcls_main.c @@ -397,8 +397,20 @@ static bool check_irq_name(const char *irq_name, struct vecls_netdev_info *vecls static void get_netdev_queue_info(struct vecls_netdev_info *vecls_dev) { struct vecls_netdev_queue_info *rxq_info; + struct ethtool_channels echannels = {0}; + int irq, cpu, ret, combined_channels; + struct cmd_context ctx = {0}; struct irq_desc *desc; - int irq, cpu; + + strscpy(ctx.netdev, vecls_dev->dev_name, IFNAMSIZ); + echannels.cmd = ETHTOOL_GCHANNELS; + ret = send_ethtool_ioctl(&ctx, &echannels); + if (ret) { + vecls_error("get %s channels fail ret:%d\n", vecls_dev->dev_name, ret); + return; + } + + combined_channels = echannels.combined_count; for_each_irq_desc(irq, desc) { if (!desc->action) @@ -409,6 +421,10 @@ static void get_netdev_queue_info(struct vecls_netdev_info *vecls_dev) continue; if (vecls_dev->rxq_num >= VECLS_MAX_RXQ_NUM_PER_DEV) break; + if (vecls_dev->rxq_num > combined_channels - 1) + break; + vecls_debug("rxq_num:%d channels:%d\n", vecls_dev->rxq_num, combined_channels); + rxq_info = &vecls_dev->rxq[vecls_dev->rxq_num++]; rxq_info->irq = irq; cpu = cpumask_first(irq_data_get_effective_affinity_mask(&desc->irq_data)); -- 2.34.1
hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9102 -------------------------------- Make VENETCLS additionally depend on m (on top of MODULES && ARM64), so that it can only be built as a module, and keep module as the default. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> Signed-off-by: Li Xiasong <lixiasong1@huawei.com> --- net/venetcls/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/venetcls/Kconfig b/net/venetcls/Kconfig index 7ba9b35f1623..99d36ea5daff 100644 --- a/net/venetcls/Kconfig +++ b/net/venetcls/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VENETCLS tristate "Network classification" - depends on MODULES && ARM64 + depends on MODULES && ARM64 && m default m help This introduces a kind of network optimization method, which can -- 2.34.1
From: Li Xiasong <lixiasong1@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9175 -------------------------------- clean_vecls_sk_rules() releases vecls_sk_rule objects, but leaves vecls_sk_entry objects in vecls_sk_list allocated. Free vecls_sk_list entries during cleanup before walking rule buckets, so both indexes are released on teardown. Signed-off-by: Li Xiasong <lixiasong1@huawei.com> Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- net/venetcls/venetcls_ntuple.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/venetcls/venetcls_ntuple.c b/net/venetcls/venetcls_ntuple.c index 8fc5d8da06fe..c770fa5e69de 100644 --- a/net/venetcls/venetcls_ntuple.c +++ b/net/venetcls/venetcls_ntuple.c @@ -620,6 +620,7 @@ static void clean_vecls_sk_rules(void) { struct vecls_netdev_info *vecls_dev; struct cmd_context ctx = { 0 }; + struct vecls_sk_entry *entry; struct vecls_sk_rule *rule; struct hlist_head *hlist; struct hlist_node *n; @@ -627,6 +628,15 @@ static void clean_vecls_sk_rules(void) int err; mutex_lock(&vecls_sk_rules.mutex); + for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) { + hlist = &vecls_sk_list.hash[i]; + + hlist_for_each_entry_safe(entry, n, hlist, node) { + hlist_del_init(&entry->node); + kfree(entry); + } + } + for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) { hlist = &vecls_sk_rules.hash[i]; @@ -640,7 +650,7 @@ static void clean_vecls_sk_rules(void) vecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, err:%d\n", rule->sk, rule->devid, rule->action, rule->ruleid, err); - hlist_del(&rule->node); + hlist_del_init(&rule->node); vecls_debug("clean rule=%p\n", rule); kfree(rule); } -- 2.34.1
From: Li Xiasong <lixiasong1@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9102 -------------------------------- get_rps_cpu() uses reciprocal_scale(hash, cpus_num - 1) when selecting RPS CPU indices for both NUMA and cluster policies. This excludes the last bucket from selection and introduces a persistent distribution bias. Use the full candidate count in reciprocal_scale() so all candidate CPUs can be selected uniformly, and keep boundary checks for invalid counts. Signed-off-by: Li Xiasong <lixiasong1@huawei.com> Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- net/venetcls/venetcls_flow.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/venetcls/venetcls_flow.c b/net/venetcls/venetcls_flow.c index 9562dc9ae03c..bf8ce8a20d13 100644 --- a/net/venetcls/venetcls_flow.c +++ b/net/venetcls/venetcls_flow.c @@ -223,11 +223,11 @@ static inline u32 get_rps_cpu(u32 last_recv_cpu, u32 hash, int policy) if (policy == 1) { newcpu = cpumask_first(cpumask_of_node(cpu_to_node(last_recv_cpu))); - index = rps_cpus[reciprocal_scale(hash, rps_cpus_nums - 1)]; + index = rps_cpus[reciprocal_scale(hash, rps_cpus_nums)]; newcpu += index; } else if (policy == 2) { newcpu = cpumask_first(topology_cluster_cpumask(last_recv_cpu)); - index = cluster_rps_cpus[reciprocal_scale(hash, cluster_rps_cpus_nums - 1)]; + index = cluster_rps_cpus[reciprocal_scale(hash, cluster_rps_cpus_nums)]; newcpu += index; } else { newcpu = last_recv_cpu; -- 2.34.1
From: Li Xiasong <lixiasong1@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9102 -------------------------------- In mode 0, vecls_ntuple_res_init() may succeed before vecls_flow_res_init(). If flow init fails, ntuple resources are not released on the error path. Add a dedicated cleanup label and call vecls_ntuple_res_clean() before falling through the common teardown path, so init rollback is complete. Signed-off-by: Li Xiasong <lixiasong1@huawei.com> Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- net/venetcls/venetcls_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c index fefdbd7d552a..ab7808f6a955 100644 --- a/net/venetcls/venetcls_main.c +++ b/net/venetcls/venetcls_main.c @@ -1142,8 +1142,11 @@ static __init int vecls_init(void) err = vecls_ntuple_res_init(); if (err) goto clean_rxq; - if (lo_rps_policy || rps_policy) + if (lo_rps_policy || rps_policy) { err = vecls_flow_res_init(); + if (err) + goto clean_ntuple; + } } else { err = vecls_flow_res_init(); } @@ -1163,6 +1166,8 @@ static __init int vecls_init(void) return 0; +clean_ntuple: + vecls_ntuple_res_clean(); clean_rxq: clean_numa: clean_vecls_netdev_info(); -- 2.34.1
From: Li Xiasong <lixiasong1@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9201 -------------------------------- venetcls currently accepts dft_num/sft_num == 0. A zero table size can reach roundup_pow_of_two(0), whose result is undefined, and then derive an invalid hash mask while allocating zero-entry tables. This may lead to out-of-bounds access in flow lookup/update paths. Reject zero dft_num/sft_num in check_params(), and emit explicit errors for invalid mode/ifname/table-size values to make module init failures diagnosable. Signed-off-by: Li Xiasong <lixiasong1@huawei.com> Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- net/venetcls/venetcls_main.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c index ab7808f6a955..fd466597ed7d 100644 --- a/net/venetcls/venetcls_main.c +++ b/net/venetcls/venetcls_main.c @@ -69,12 +69,25 @@ MODULE_PARM_DESC(sft_num, "sock flow table entries, default 0x100000"); static bool check_params(void) { - if (mode != 0 && mode != 1) + if (mode != 0 && mode != 1) { + vecls_error("invalid mode=%d, expected 0/1\n", mode); return false; + } - if (strlen(ifname) == 0) + if (strlen(ifname) == 0) { + vecls_error("invalid ifname, empty string\n"); return false; + } + if (dft_num == 0) { + vecls_error("invalid dft_num=%u, must be > 0\n", dft_num); + return false; + } + + if (sft_num == 0) { + vecls_error("invalid sft_num=%u, must be > 0\n", sft_num); + return false; + } return true; } -- 2.34.1
From: Li Xiasong <lixiasong1@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9198 -------------------------------- Tuple resolution for ntuple rules may require netdevice address lookup under rtnl protection. Doing this in the listen-side path causes locking context issues and may trigger invalid lock usage from RCU-related context. Move tuple resolution to cfg_work, and only queue a socket snapshot from the listen-side path. This keeps the locking in a sleepable worker context while preserving the existing async rule programming flow. Signed-off-by: Li Xiasong <lixiasong1@huawei.com> Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- net/venetcls/venetcls.h | 7 ++++++ net/venetcls/venetcls_ntuple.c | 46 ++++++++++++++++++++++------------ 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/net/venetcls/venetcls.h b/net/venetcls/venetcls.h index 4313939e91d9..83e2f9baa228 100644 --- a/net/venetcls/venetcls.h +++ b/net/venetcls/venetcls.h @@ -132,6 +132,13 @@ struct cfg_param { struct work_struct work; struct cmd_context ctx; struct sock *sk; + struct { + struct net *net; + u16 family; + u16 lport; + __be32 rcv_saddr_v4; + struct in6_addr rcv_saddr_v6; + } sk_snapshot; bool is_del; int nid; int cpu; diff --git a/net/venetcls/venetcls_ntuple.c b/net/venetcls/venetcls_ntuple.c index c770fa5e69de..6044465d0bba 100644 --- a/net/venetcls/venetcls_ntuple.c +++ b/net/venetcls/venetcls_ntuple.c @@ -211,33 +211,34 @@ static void get_first_ip6_addr(struct net *net, u32 *dip6) rtnl_unlock(); } -static void get_sk_rule_addr(struct sock *sk, struct cfg_param *ctx_p) +static void get_sk_rule_addr(struct cfg_param *ctx_p) { - bool is_ipv6 = !!(sk->sk_family == AF_INET6); + bool is_ipv6 = !!(ctx_p->sk_snapshot.family == AF_INET6); u16 *dport = &ctx_p->ctx.dport; u32 *dip4 = &ctx_p->ctx.dip4; u32 *dip6 = &ctx_p->ctx.dip6[0]; - *dport = htons(sk->sk_num); + *dport = htons(ctx_p->sk_snapshot.lport); ctx_p->ctx.is_ipv6 = 
is_ipv6; if (!match_ip_flag) { *dip4 = 0; - memset(dip6, 0, sizeof(sk->sk_v6_rcv_saddr)); + memset(dip6, 0, sizeof(ctx_p->sk_snapshot.rcv_saddr_v6)); return; } if (is_ipv6) { - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) - memcpy(dip6, &sk->sk_v6_rcv_saddr, sizeof(sk->sk_v6_rcv_saddr)); + if (!ipv6_addr_any(&ctx_p->sk_snapshot.rcv_saddr_v6)) + memcpy(dip6, &ctx_p->sk_snapshot.rcv_saddr_v6, + sizeof(ctx_p->sk_snapshot.rcv_saddr_v6)); else - get_first_ip6_addr(sock_net(sk), dip6); + get_first_ip6_addr(ctx_p->sk_snapshot.net, dip6); } else { - if (sk->sk_rcv_saddr) - *dip4 = sk->sk_rcv_saddr; + if (ctx_p->sk_snapshot.rcv_saddr_v4) + *dip4 = ctx_p->sk_snapshot.rcv_saddr_v4; else - *dip4 = get_first_ip4_addr(sock_net(sk)); + *dip4 = get_first_ip4_addr(ctx_p->sk_snapshot.net); } } @@ -476,6 +477,8 @@ static void cfg_work(struct work_struct *work) struct vecls_sk_rule *rule; int devid, rxq_id, err; + get_sk_rule_addr(ctx_p); + mutex_lock(&vecls_sk_rules.mutex); for (devid = 0; devid < vecls_netdev_num; devid++) { vecls_dev = get_vecls_netdev_info(devid); @@ -527,6 +530,7 @@ static void cfg_work(struct work_struct *work) } } mutex_unlock(&vecls_sk_rules.mutex); + put_net(ctx_p->sk_snapshot.net); kfree(ctx_p); atomic_dec(&vecls_worker_count); } @@ -558,10 +562,15 @@ static void del_ntuple_rule(struct sock *sk) ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); if (!ctx_p) return; - get_sk_rule_addr(sk, ctx_p); ctx_p->is_del = true; ctx_p->sk = sk; + ctx_p->sk_snapshot.net = get_net(sock_net(sk)); + ctx_p->sk_snapshot.family = sk->sk_family; + ctx_p->sk_snapshot.lport = sk->sk_num; + ctx_p->sk_snapshot.rcv_saddr_v4 = sk->sk_rcv_saddr; + ctx_p->sk_snapshot.rcv_saddr_v6 = sk->sk_v6_rcv_saddr; + INIT_WORK(&ctx_p->work, cfg_work); queue_work(do_cfg_workqueue, &ctx_p->work); atomic_inc(&vecls_worker_count); @@ -570,8 +579,7 @@ static void del_ntuple_rule(struct sock *sk) static void add_ntuple_rule(struct sock *sk) { struct cfg_param *ctx_p; - int cpu = raw_smp_processor_id(); - int nid 
= cpu_to_node(cpu); + int cpu; if (check_appname(current->comm)) return; @@ -579,12 +587,18 @@ static void add_ntuple_rule(struct sock *sk) ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); if (!ctx_p) return; - get_sk_rule_addr(sk, ctx_p); + cpu = raw_smp_processor_id(); ctx_p->is_del = false; - ctx_p->sk = sk; - ctx_p->nid = nid; ctx_p->cpu = cpu; + ctx_p->nid = cpu_to_node(cpu); + ctx_p->sk = sk; + ctx_p->sk_snapshot.net = get_net(sock_net(sk)); + ctx_p->sk_snapshot.family = sk->sk_family; + ctx_p->sk_snapshot.lport = sk->sk_num; + ctx_p->sk_snapshot.rcv_saddr_v4 = sk->sk_rcv_saddr; + ctx_p->sk_snapshot.rcv_saddr_v6 = sk->sk_v6_rcv_saddr; + INIT_WORK(&ctx_p->work, cfg_work); queue_work(do_cfg_workqueue, &ctx_p->work); atomic_inc(&vecls_worker_count); -- 2.34.1
From: Li Xiasong <lixiasong1@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9244 -------------------------------- Rule add/delete is handled asynchronously and delete can be queued before the corresponding add work has committed the rule state. This may cause delete to be skipped by an early "rule not found" check. Move the existence check into cfg_work under vecls_sk_rules.mutex, so the decision is made in the same serialized worker context as rule updates. This avoids dropping valid delete requests during add/delete reordering. Signed-off-by: Li Xiasong <lixiasong1@huawei.com> Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- net/venetcls/venetcls_ntuple.c | 44 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/net/venetcls/venetcls_ntuple.c b/net/venetcls/venetcls_ntuple.c index 6044465d0bba..22539a0e559a 100644 --- a/net/venetcls/venetcls_ntuple.c +++ b/net/venetcls/venetcls_ntuple.c @@ -147,6 +147,23 @@ static struct vecls_sk_rule *get_rule_from_sk(int devid, void *sk) return rule; } +static bool has_sock_rule(struct sock *sk) +{ + struct vecls_netdev_info *vecls_dev; + struct vecls_sk_rule *rule; + int devid; + + for (devid = 0; devid < vecls_netdev_num; devid++) { + vecls_dev = get_vecls_netdev_info(devid); + if (!vecls_dev) + continue; + rule = get_rule_from_sk(devid, sk); + if (rule) + return true; + } + return false; +} + static inline bool reuseport_check(int devid, struct cmd_context ctx) { return !!get_sk_rule(devid, ctx); @@ -477,6 +494,11 @@ static void cfg_work(struct work_struct *work) struct vecls_sk_rule *rule; int devid, rxq_id, err; + mutex_lock(&vecls_sk_rules.mutex); + if (ctx_p->is_del && !has_sock_rule(ctx_p->sk)) + goto out_lock; + mutex_unlock(&vecls_sk_rules.mutex); + get_sk_rule_addr(ctx_p); mutex_lock(&vecls_sk_rules.mutex); @@ -529,36 +551,18 @@ static void cfg_work(struct work_struct *work) del_sk_rule(rule); } } + +out_lock: 
mutex_unlock(&vecls_sk_rules.mutex); put_net(ctx_p->sk_snapshot.net); kfree(ctx_p); atomic_dec(&vecls_worker_count); } -static bool has_sock_rule(struct sock *sk) -{ - struct vecls_netdev_info *vecls_dev; - struct vecls_sk_rule *rule; - int devid; - - for (devid = 0; devid < vecls_netdev_num; devid++) { - vecls_dev = get_vecls_netdev_info(devid); - if (!vecls_dev) - continue; - rule = get_rule_from_sk(devid, sk); - if (rule) - return true; - } - return false; -} - static void del_ntuple_rule(struct sock *sk) { struct cfg_param *ctx_p; - if (!has_sock_rule(sk)) - return; - ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); if (!ctx_p) return; -- 2.34.1
From: Li Xiasong <lixiasong1@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9245 -------------------------------- rps_policy and lo_rps_policy are not safe to change at runtime because their dependent resources are not fully initialized/torn down for dynamic switching. Runtime updates can lead to use/cleanup path issues. Make both module parameters read-only to prevent unsupported dynamic reconfiguration and keep init/exit behavior consistent. Signed-off-by: Li Xiasong <lixiasong1@huawei.com> Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- net/venetcls/venetcls_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c index fd466597ed7d..02b9a62a9fa8 100644 --- a/net/venetcls/venetcls_main.c +++ b/net/venetcls/venetcls_main.c @@ -20,11 +20,11 @@ static int vecls_cluster_cpu_num, vecls_cluster_per_numa; static struct vecls_numa_info *vecls_numa_info_table; int rps_policy = 1; -module_param(rps_policy, int, 0644); +module_param(rps_policy, int, 0444); MODULE_PARM_DESC(rps_policy, "phy nic rps policy, default 1"); int lo_rps_policy; -module_param(lo_rps_policy, int, 0644); +module_param(lo_rps_policy, int, 0444); MODULE_PARM_DESC(lo_rps_policy, "loopback rps policy, default 0"); int debug; -- 2.34.1
participants (2)
-
patchwork bot -
Yue Haibing