hulk inclusion category: feature Link: https://gitee.com/openeuler/kernel/issues/ICBFCS CVE: NA -------------------------------- Use NUMA-aware flow tables for local flows to achieve better cache effectiveness and NUMA affinity. Also cache check_appname results in sk to avoid unnecessary dup check. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- include/linux/skbuff.h | 3 ++ include/linux/venetcls.h | 40 ++++++++++++-- include/net/sock.h | 3 ++ net/core/dev.c | 26 +++++---- net/core/sock.c | 3 ++ net/ipv4/tcp.c | 5 +- net/venetcls/venetcls.h | 8 +++ net/venetcls/venetcls_flow.c | 98 ++++++++++++++++++++++++++++++---- net/venetcls/venetcls_main.c | 31 ++++++++--- net/venetcls/venetcls_ntuple.c | 3 +- 10 files changed, 189 insertions(+), 31 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fb589a653837..d781ee4b8a48 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -958,6 +958,9 @@ struct sk_buff { __u32 priority; int skb_iif; __u32 hash; +#if IS_ENABLED(CONFIG_VENETCLS) + __u32 sym_hash; +#endif __be16 vlan_proto; __u16 vlan_tci; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) diff --git a/include/linux/venetcls.h b/include/linux/venetcls.h index 9cfcdd4e5766..fab7e57fde89 100644 --- a/include/linux/venetcls.h +++ b/include/linux/venetcls.h @@ -2,16 +2,20 @@ #ifndef _LINUX_VENETCLS_H #define _LINUX_VENETCLS_H +#include <linux/if_arp.h> + struct vecls_hook_ops { void (*vecls_cfg_rxcls)(struct sock *sk, int is_del); - void (*vecls_flow_update)(struct sock *sk); + void (*vecls_flow_update)(struct sock *sk, struct sk_buff *skb); void (*vecls_set_cpu)(struct sk_buff *skb, int *cpu, int *last_qtail); + void (*vecls_set_localcpu)(struct sk_buff *skb, int *cpu, int *last_qtail); bool (*vecls_timeout)(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); }; typedef int (*enqueue_f)(struct sk_buff *skb, int cpu, unsigned int *qtail); extern const struct vecls_hook_ops __rcu *vecls_ops; +extern 
struct static_key_false vecls_localrps_needed; static inline void venetcls_cfg_rxcls(struct sock *sk, int is_del) { @@ -24,14 +28,14 @@ static inline void venetcls_cfg_rxcls(struct sock *sk, int is_del) rcu_read_unlock(); } -static inline void venetcls_flow_update(struct sock *sk) +static inline void venetcls_flow_update(struct sock *sk, struct sk_buff *skb) { const struct vecls_hook_ops *ops; rcu_read_lock(); ops = rcu_dereference(vecls_ops); if (ops && ops->vecls_flow_update) - ops->vecls_flow_update(sk); + ops->vecls_flow_update(sk, skb); rcu_read_unlock(); } @@ -44,10 +48,18 @@ venetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) rcu_read_lock(); ops = rcu_dereference(vecls_ops); - if (ops && ops->vecls_set_cpu) { + if (ops) { cpu = -1; last_qtail = 0; - ops->vecls_set_cpu(skb, &cpu, &last_qtail); + /* mode 1 always use vecls_set_cpu hook for physical NIC or lo. + * mode 0 set this hook to NULL, to avoid unneeded ops in + * venetcls_skblist_set_cpu() for physical NIC flows, and use + * vecls_set_localcpu hook for loopback flows. 
+ */ + if (ops->vecls_set_cpu) + ops->vecls_set_cpu(skb, &cpu, &last_qtail); + else if (ops->vecls_set_localcpu) + ops->vecls_set_localcpu(skb, &cpu, &last_qtail); if (cpu >= 0) { *ret = enq_func(skb, cpu, &last_qtail); result = true; @@ -57,6 +69,24 @@ venetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) return result; } +static inline bool +venetcls_skb_set_localcpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) +{ + struct net_device *dev = skb->dev; + bool result = false; + + if (!static_branch_unlikely(&vecls_localrps_needed)) + return result; + if (!dev || !(dev->type == ARPHRD_LOOPBACK && dev->flags & IFF_LOOPBACK)) + return result; + + preempt_disable(); + if (venetcls_skb_set_cpu(skb, enq_func, ret)) + result = true; + preempt_enable(); + return result; +} + static inline void venetcls_skblist_set_cpu(struct list_head *head, enqueue_f enq_func) { diff --git a/include/net/sock.h b/include/net/sock.h index 7fca7acb7d30..161e3e53ff72 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -537,6 +537,9 @@ struct sock { #endif struct rcu_head sk_rcu; struct xarray sk_pagepool; +#if IS_ENABLED(CONFIG_VENETCLS) + u8 vecls_cmd_matched; +#endif }; enum sk_pacing { diff --git a/net/core/dev.c b/net/core/dev.c index 47b916ca8d46..b62fcd0a6daf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -164,6 +164,8 @@ #include <linux/venetcls.h> const struct vecls_hook_ops __rcu *vecls_ops __read_mostly; EXPORT_SYMBOL_GPL(vecls_ops); +struct static_key_false vecls_localrps_needed __read_mostly; +EXPORT_SYMBOL(vecls_localrps_needed); #endif static DEFINE_SPINLOCK(ptype_lock); @@ -5199,6 +5201,12 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); +#if IS_ENABLED(CONFIG_VENETCLS) + if (static_branch_unlikely(&vecls_localrps_needed)) { + if (venetcls_skb_set_localcpu(skb, enqueue_to_backlog, &ret)) + return ret; + } +#endif #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = 
&voidflow; @@ -5880,6 +5888,12 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return NET_RX_SUCCESS; rcu_read_lock(); +#if IS_ENABLED(CONFIG_VENETCLS) + if (venetcls_skb_set_cpu(skb, enqueue_to_backlog, &ret)) { + rcu_read_unlock(); + return ret; + } +#endif #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -5891,12 +5905,6 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } } -#endif -#if IS_ENABLED(CONFIG_VENETCLS) - if (venetcls_skb_set_cpu(skb, enqueue_to_backlog, &ret)) { - rcu_read_unlock(); - return ret; - } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); @@ -5918,6 +5926,9 @@ static void netif_receive_skb_list_internal(struct list_head *head) list_splice_init(&sublist, head); rcu_read_lock(); +#if IS_ENABLED(CONFIG_VENETCLS) + venetcls_skblist_set_cpu(head, enqueue_to_backlog); +#endif #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { @@ -5931,9 +5942,6 @@ static void netif_receive_skb_list_internal(struct list_head *head) } } } -#endif -#if IS_ENABLED(CONFIG_VENETCLS) - venetcls_skblist_set_cpu(head, enqueue_to_backlog); #endif __netif_receive_skb_list(head); rcu_read_unlock(); diff --git a/net/core/sock.c b/net/core/sock.c index d104194eef9a..748f0f4e1115 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2118,6 +2118,9 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); sk_tx_queue_clear(sk); +#if IS_ENABLED(CONFIG_VENETCLS) + sk->vecls_cmd_matched = 0; +#endif } return sk; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cc84873cee0d..48686eef0768 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2900,6 +2900,9 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, if (used + offset < skb->len) continue; +#if IS_ENABLED(CONFIG_VENETCLS) + 
venetcls_flow_update(sk, skb); +#endif if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) @@ -2944,7 +2947,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, return inet_recv_error(sk, msg, len, addr_len); #if IS_ENABLED(CONFIG_VENETCLS) - venetcls_flow_update(sk); + venetcls_flow_update(sk, NULL); #endif if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && diff --git a/net/venetcls/venetcls.h b/net/venetcls/venetcls.h index 957645e28acf..05fe2e3592f4 100644 --- a/net/venetcls/venetcls.h +++ b/net/venetcls/venetcls.h @@ -15,6 +15,10 @@ #define RXQ_MAX_USECNT 0xFF +#define VECLS_CMD_UNKNOWN 0 +#define VECLS_CMD_MATCHED 1 +#define VECLS_CMD_NO_MATCH 2 + struct vecls_netdev_queue_info { int irq; int affinity_cpu; @@ -133,6 +137,8 @@ struct cfg_param { int cpu; }; +extern int lo_numa_rps; +extern int mode; extern int match_ip_flag; extern int debug; extern int vecls_netdev_num; @@ -179,5 +185,7 @@ int venetcls_ntuple_status(struct seq_file *seq, void *v); int vecls_flow_res_init(void); void vecls_flow_res_clean(void); int venetcls_flow_status(struct seq_file *seq, void *v); +void _vecls_flow_update(struct sock *sk, struct sk_buff *skb); +void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail); #endif /* _NET_VENETCLS_H */ diff --git a/net/venetcls/venetcls_flow.c b/net/venetcls/venetcls_flow.c index 4ca2191d0718..85d1abe4c708 100644 --- a/net/venetcls/venetcls_flow.c +++ b/net/venetcls/venetcls_flow.c @@ -5,11 +5,17 @@ #include <linux/irq.h> #include <linux/irqdesc.h> #include <linux/inet.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <net/ipv6.h> +#include <net/inet_sock.h> #include <linux/venetcls.h> #include <net/sock.h> #include "venetcls.h" +static u16 *rps_cpus; +static int rps_cpus_nums; static u32 vecls_cpu_mask; static struct vecls_sock_flow_table __rcu *vecls_sock_flow_table; static DEFINE_MUTEX(vecls_sock_flow_mutex); @@ -61,22 +67,47 @@ 
static bool _vecls_timeout(struct net_device *dev, u16 rxq_index, return expire; } -static void _vecls_flow_update(struct sock *sk) +static inline bool sk_is_loopback(struct sock *sk) +{ + if (sk->sk_family == AF_INET) { + if (ipv4_is_loopback(sk->sk_daddr) && ipv4_is_loopback(sk->sk_rcv_saddr)) + return true; + } + + if (sk->sk_family == AF_INET6) { + if (ipv6_addr_loopback(&sk->sk_v6_daddr) && + ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + return true; + } + return false; +} + +void _vecls_flow_update(struct sock *sk, struct sk_buff *skb) { struct vecls_sock_flow_table *tb; unsigned int hash, index; - u32 val; - u32 cpu = raw_smp_processor_id(); + u32 val, cpu; if (sk->sk_state != TCP_ESTABLISHED) return; - if (check_appname(current->comm)) + if (unlikely(sk->vecls_cmd_matched == VECLS_CMD_UNKNOWN)) { + if (check_appname(current->comm)) { + sk->vecls_cmd_matched = VECLS_CMD_NO_MATCH; + return; + } + sk->vecls_cmd_matched = VECLS_CMD_MATCHED; + } + if (sk->vecls_cmd_matched != VECLS_CMD_MATCHED) return; + cpu = raw_smp_processor_id(); rcu_read_lock(); tb = rcu_dereference(vecls_sock_flow_table); - hash = READ_ONCE(sk->sk_rxhash); + if (lo_numa_rps && skb && sk_is_loopback(sk)) + hash = READ_ONCE(skb->sym_hash); + else + hash = READ_ONCE(sk->sk_rxhash); if (tb && hash) { index = hash & tb->mask; val = hash & ~vecls_cpu_mask; @@ -185,7 +216,7 @@ static void set_vecls_cpu(struct net_device *dev, struct sk_buff *skb, rflow->cpu = next_cpu; } -static int get_cpu_in_numa(int tcpu, u32 hash) +static inline int get_cpu_in_numa(int tcpu, u32 hash) { const struct cpumask *mask; int nr_cpus, cpu, index; @@ -249,7 +280,40 @@ static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, set_vecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu); } -static void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) +static inline void loopback_numa_rps(struct sk_buff *skb, int *rcpu) +{ + struct vecls_sock_flow_table *stb; + u32 last_recv_cpu, hash, val; + int 
newcpu, index; + + skb_reset_network_header(skb); + hash = __skb_get_hash_symmetric(skb); + if (!hash) + return; + + WRITE_ONCE(skb->sym_hash, hash); + rcu_read_lock(); + stb = rcu_dereference(vecls_sock_flow_table); + if (stb) { + val = READ_ONCE(stb->ents[hash & stb->mask]); + last_recv_cpu = val & vecls_cpu_mask; + } else { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + if ((val ^ hash) & ~vecls_cpu_mask) + return; + + newcpu = cpumask_first(cpumask_of_node(cpu_to_node(last_recv_cpu))); + index = rps_cpus[reciprocal_scale(hash, rps_cpus_nums)]; + newcpu += index; + *rcpu = newcpu; + vecls_debug("last:%u curcpu:%d newcpu:%d\n", last_recv_cpu, raw_smp_processor_id(), newcpu); +} + +void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) { struct net_device *ndev = skb->dev; struct vecls_sock_flow_table *stb; @@ -262,6 +326,9 @@ static void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) if (!ndev) return; + if (lo_numa_rps && (ndev->type == ARPHRD_LOOPBACK && ndev->flags & IFF_LOOPBACK)) + loopback_numa_rps(skb, cpu); + if (!is_vecls_config_netdev(ndev->name)) return; @@ -399,6 +466,7 @@ static int vecls_dev_flow_table_init(void) static const struct vecls_hook_ops vecls_flow_ops = { .vecls_flow_update = _vecls_flow_update, .vecls_set_cpu = _vecls_set_cpu, + .vecls_set_localcpu = NULL, .vecls_timeout = _vecls_timeout, .vecls_cfg_rxcls = NULL, }; @@ -415,6 +483,7 @@ static int vecls_sock_flow_table_release(void) mutex_unlock(&vecls_sock_flow_mutex); synchronize_rcu(); vfree(tb); + kfree(rps_cpus); return 0; } @@ -469,10 +538,20 @@ static int vecls_sock_flow_table_init(void) struct vecls_sock_flow_table *table; int size = sft_num, i; + rps_cpus_nums = cpumask_weight(cpumask_of_node(0)); + rps_cpus = kmalloc_array(rps_cpus_nums, sizeof(u16), GFP_KERNEL); + if (!rps_cpus) + return -ENOMEM; + for (i = 0; i < rps_cpus_nums; i++) + rps_cpus[i] = i; + vecls_debug("rps_cpus_nums:%d\n", rps_cpus_nums); + size =
roundup_pow_of_two(size); table = vmalloc(VECLS_SOCK_FLOW_TABLE_SIZE(size)); - if (!table) + if (!table) { + kfree(rps_cpus); return -ENOMEM; + } vecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; vecls_debug("nr_cpu_ids:%d, vecls_cpu_mask:0x%x\n", nr_cpu_ids, vecls_cpu_mask); @@ -499,7 +578,8 @@ int vecls_flow_res_init(void) if (err) goto clean; - RCU_INIT_POINTER(vecls_ops, &vecls_flow_ops); + if (mode != 0) //for lo rps + RCU_INIT_POINTER(vecls_ops, &vecls_flow_ops); synchronize_rcu(); return 0; diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c index 30f7a2bd4570..e1c74b4b669c 100644 --- a/net/venetcls/venetcls_main.c +++ b/net/venetcls/venetcls_main.c @@ -8,6 +8,7 @@ #include <linux/proc_fs.h> #include <linux/rtnetlink.h> #include <linux/seq_file.h> +#include <linux/venetcls.h> #include "venetcls.h" int vecls_netdev_num; @@ -17,11 +18,15 @@ int vecls_numa_num; static int vecls_cluster_cpu_num, vecls_cluster_per_numa; static struct vecls_numa_info *vecls_numa_info_table; +int lo_numa_rps; +module_param(lo_numa_rps, int, 0644); +MODULE_PARM_DESC(lo_numa_rps, "enable loopback flow numa affinity"); + int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "debug switch"); -static int mode; +int mode; module_param(mode, int, 0444); MODULE_PARM_DESC(mode, "mode, default 0"); @@ -516,7 +521,8 @@ static int init_single_vecls_dev(char *if_name, unsigned int length) ret = vecls_filter_enable(dev_name, &old_state); if (ret) { vecls_error("dev [%s] not support ntuple! 
ret=%d\n", dev_name, ret); - goto out; + if (!lo_numa_rps) + goto out; } vecls_dev = alloc_vecls_netdev_info(); @@ -1111,10 +1117,15 @@ static __init int vecls_init(void) set_netdev_xps_queue(true); #endif - if (mode == 0) + if (mode == 0) { err = vecls_ntuple_res_init(); - else + if (err) + goto clean_rxq; + if (lo_numa_rps) + err = vecls_flow_res_init(); + } else { err = vecls_flow_res_init(); + } if (err) goto clean_rxq; @@ -1126,6 +1137,8 @@ static __init int vecls_init(void) goto clean_rxq; } #endif + if (lo_numa_rps) + static_branch_inc(&vecls_localrps_needed); return 0; @@ -1138,13 +1151,19 @@ static __init int vecls_init(void) static __exit void vecls_exit(void) { + if (lo_numa_rps) + static_branch_dec(&vecls_localrps_needed); + #ifdef CONFIG_PROC_FS remove_proc_entry("venet_status", init_net.proc_net); #endif - if (mode == 0) + if (mode == 0) { vecls_ntuple_res_clean(); - else + if (lo_numa_rps) + vecls_flow_res_clean(); + } else { vecls_flow_res_clean(); + } #ifdef CONFIG_XPS set_netdev_xps_queue(false); diff --git a/net/venetcls/venetcls_ntuple.c b/net/venetcls/venetcls_ntuple.c index ad3c10f8ae5f..ac73d548fdee 100644 --- a/net/venetcls/venetcls_ntuple.c +++ b/net/venetcls/venetcls_ntuple.c @@ -681,7 +681,8 @@ int venetcls_ntuple_status(struct seq_file *seq, void *v) } static const struct vecls_hook_ops vecls_ntuple_ops = { - .vecls_flow_update = NULL, + .vecls_flow_update = _vecls_flow_update, + .vecls_set_localcpu = _vecls_set_cpu, .vecls_set_cpu = NULL, .vecls_timeout = NULL, .vecls_cfg_rxcls = ethtool_cfg_rxcls, -- 2.34.1