From: Yue Haibing <yuehaibing@huawei.com> hulk inclusion category: feature Link: https://gitee.com/openeuler/kernel/issues/ICBFCS CVE: NA -------------------------------- Use NUMA-aware flow tables for local flows to achieve better cache effectiveness and NUMA affinity. Also cache check_appname results in sk to avoid unnecessary dup check. Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- include/linux/oenetcls.h | 40 ++++++++++-- include/linux/skbuff.h | 4 ++ include/net/sock.h | 4 ++ net/core/dev.c | 8 +++ net/core/sock.c | 3 + net/ipv4/tcp.c | 5 +- net/oenetcls/oenetcls.h | 7 +++ net/oenetcls/oenetcls_flow.c | 112 +++++++++++++++++++++++++++++---- net/oenetcls/oenetcls_main.c | 30 +++++++-- net/oenetcls/oenetcls_ntuple.c | 3 +- 10 files changed, 191 insertions(+), 25 deletions(-) diff --git a/include/linux/oenetcls.h b/include/linux/oenetcls.h index 09f89131f32b..b618aa6b807f 100644 --- a/include/linux/oenetcls.h +++ b/include/linux/oenetcls.h @@ -2,10 +2,13 @@ #ifndef _LINUX_OENETCLS_H #define _LINUX_OENETCLS_H +#include <linux/if_arp.h> + struct oecls_hook_ops { void (*oecls_cfg_rxcls)(struct sock *sk, int is_del); - void (*oecls_flow_update)(struct sock *sk); + void (*oecls_flow_update)(struct sock *sk, struct sk_buff *skb); void (*oecls_set_cpu)(struct sk_buff *skb, int *cpu, int *last_qtail); + void (*oecls_set_localcpu)(struct sk_buff *skb, int *cpu, int *last_qtail); bool (*oecls_timeout)(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); }; @@ -13,6 +16,7 @@ struct oecls_hook_ops { typedef int (*enqueue_f)(struct sk_buff *skb, int cpu, unsigned int *qtail); extern const struct oecls_hook_ops __rcu *oecls_ops; extern struct static_key_false oecls_rps_needed; +extern struct static_key_false oecls_localrps_needed; static inline void oenetcls_cfg_rxcls(struct sock *sk, int is_del) { @@ -25,14 +29,14 @@ static inline void oenetcls_cfg_rxcls(struct sock *sk, int is_del) rcu_read_unlock(); } -static inline void oenetcls_flow_update(struct 
sock *sk) +static inline void oenetcls_flow_update(struct sock *sk, struct sk_buff *skb) { const struct oecls_hook_ops *ops; rcu_read_lock(); ops = rcu_dereference(oecls_ops); if (ops && ops->oecls_flow_update) - ops->oecls_flow_update(sk); + ops->oecls_flow_update(sk, skb); rcu_read_unlock(); } @@ -45,8 +49,16 @@ oenetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) rcu_read_lock(); ops = rcu_dereference(oecls_ops); - if (ops && ops->oecls_set_cpu) { - ops->oecls_set_cpu(skb, &cpu, &last_qtail); + if (ops) { + /* mode 1 always use oecls_set_cpu hook for physical NIC or lo. + * mode 0 set this hook to NULL, to avoid unneeded ops in + * oenetcls_skblist_set_cpu() for physical NIC flows, and use + * oecls_set_localcpu hook for loopback flows. + */ + if (ops->oecls_set_cpu) + ops->oecls_set_cpu(skb, &cpu, &last_qtail); + else if (ops->oecls_set_localcpu) + ops->oecls_set_localcpu(skb, &cpu, &last_qtail); if (cpu >= 0) { *ret = enq_func(skb, cpu, &last_qtail); result = true; @@ -56,6 +68,24 @@ oenetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) return result; } +static inline bool +oenetcls_skb_set_localcpu(struct sk_buff *skb, enqueue_f enq_func, int *ret) +{ + struct net_device *dev = skb->dev; + bool result = false; + + if (!static_branch_unlikely(&oecls_localrps_needed)) + return result; + if (!dev || !(dev->type == ARPHRD_LOOPBACK && dev->flags & IFF_LOOPBACK)) + return result; + + preempt_disable(); + if (oenetcls_skb_set_cpu(skb, enq_func, ret)) + result = true; + preempt_enable(); + return result; +} + static inline void oenetcls_skblist_set_cpu(struct list_head *head, enqueue_f enq_func) { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1597a5f9b5b8..0f985ba19006 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1056,7 +1056,11 @@ struct sk_buff { #else KABI_RESERVE(1) #endif +#if IS_ENABLED(CONFIG_OENETCLS) + KABI_USE(2, __u32 sym_hash) +#else KABI_RESERVE(2) +#endif KABI_RESERVE(3) 
KABI_RESERVE(4) diff --git a/include/net/sock.h b/include/net/sock.h index 26456cb2bf8f..c44b2025bc54 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -554,7 +554,11 @@ struct sock { u64 sk_gid_padding; }; #endif +#if IS_ENABLED(CONFIG_OENETCLS) + KABI_USE(1, u8 oecls_cmd_matched) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/net/core/dev.c b/net/core/dev.c index 06d59a919a4b..f388233f4f75 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -164,6 +164,8 @@ const struct oecls_hook_ops __rcu *oecls_ops __read_mostly; EXPORT_SYMBOL_GPL(oecls_ops); struct static_key_false oecls_rps_needed __read_mostly; EXPORT_SYMBOL(oecls_rps_needed); +struct static_key_false oecls_localrps_needed __read_mostly; +EXPORT_SYMBOL(oecls_localrps_needed); #endif static DEFINE_SPINLOCK(ptype_lock); @@ -5196,6 +5198,12 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); +#if IS_ENABLED(CONFIG_OENETCLS) + if (static_branch_unlikely(&oecls_localrps_needed)) { + if (oenetcls_skb_set_localcpu(skb, enqueue_to_backlog, &ret)) + return ret; + } +#endif #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; diff --git a/net/core/sock.c b/net/core/sock.c index d63f5ee49054..45f7f9aaca46 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2172,6 +2172,9 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); sk_tx_queue_clear(sk); +#if IS_ENABLED(CONFIG_OENETCLS) + sk->oecls_cmd_matched = 0; +#endif } return sk; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7c66c46c125f..2c98ef85072b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2543,6 +2543,9 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, if (used + offset < skb->len) continue; +#if IS_ENABLED(CONFIG_OENETCLS) + oenetcls_flow_update(sk, skb); +#endif if 
(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) @@ -2587,7 +2590,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, return inet_recv_error(sk, msg, len, addr_len); #if IS_ENABLED(CONFIG_OENETCLS) - oenetcls_flow_update(sk); + oenetcls_flow_update(sk, NULL); #endif if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && diff --git a/net/oenetcls/oenetcls.h b/net/oenetcls/oenetcls.h index 6d8e8e5e5b15..755d0ab299ee 100644 --- a/net/oenetcls/oenetcls.h +++ b/net/oenetcls/oenetcls.h @@ -13,6 +13,10 @@ #define OECLS_NO_FILTER 0xffff #define OECLS_NO_CPU 0xffff +#define OECLS_CMD_UNKNOWN 0 +#define OECLS_CMD_MATCHED 1 +#define OECLS_CMD_NO_MATCH 2 + struct oecls_netdev_queue_info { int irq; int affinity_cpu; @@ -135,6 +139,7 @@ extern int oecls_netdev_num; extern int oecls_numa_num; extern unsigned int dft_num; extern unsigned int sft_num; +extern int lo_numa_rps; #define oecls_debug(fmt, ...) \ do { \ @@ -183,5 +188,7 @@ int oecls_ntuple_res_init(void); void oecls_ntuple_res_clean(void); int oecls_flow_res_init(void); void oecls_flow_res_clean(void); +void _oecls_flow_update(struct sock *sk, struct sk_buff *skb); +void _oecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail); #endif /* _NET_OENETCLS_H */ diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c index 0953b4bd91ae..bb52a5b78c47 100644 --- a/net/oenetcls/oenetcls_flow.c +++ b/net/oenetcls/oenetcls_flow.c @@ -1,15 +1,22 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/inetdevice.h> -#include <linux/netdevice.h> -#include <linux/rtnetlink.h> +#include <linux/inet.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/inet.h> #include <linux/irq.h> #include <linux/irqdesc.h> -#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/oenetcls.h> +#include <linux/rtnetlink.h> +#include <net/inet_sock.h> +#include <net/ipv6.h> #include <net/netdev_rx_queue.h> 
#include <net/sock.h> -#include <linux/oenetcls.h> #include "oenetcls.h" +static u16 *rps_cpus; +static int rps_cpus_nums; static u32 oecls_cpu_mask; static struct oecls_sock_flow_table __rcu *oecls_sock_flow_table; static DEFINE_MUTEX(oecls_sock_flow_mutex); @@ -59,22 +66,50 @@ static bool _oecls_timeout(struct net_device *dev, u16 rxq_index, return expire; } -static void _oecls_flow_update(struct sock *sk) +static inline bool sk_is_loopback(struct sock *sk) +{ + if (sk->sk_family == AF_INET) { + if (ipv4_is_loopback(sk->sk_daddr) || ipv4_is_loopback(sk->sk_rcv_saddr)) + return true; + } + + if (sk->sk_family == AF_INET6) { + if (ipv6_addr_loopback(&sk->sk_v6_daddr) || + ipv6_addr_loopback(&sk->sk_v6_rcv_saddr) || + ipv6_addr_v4mapped_loopback(&sk->sk_v6_daddr) || + ipv6_addr_v4mapped_loopback(&sk->sk_v6_rcv_saddr) || + ipv6_addr_equal(&sk->sk_v6_daddr, &sk->sk_v6_rcv_saddr)) + return true; + } + return false; +} + +void _oecls_flow_update(struct sock *sk, struct sk_buff *skb) { struct oecls_sock_flow_table *tb; unsigned int hash, index; - u32 val; - u32 cpu = raw_smp_processor_id(); + u32 val, cpu; if (sk->sk_state != TCP_ESTABLISHED) return; - if (check_appname(current->comm)) + if (unlikely(sk->oecls_cmd_matched == OECLS_CMD_UNKNOWN)) { + if (check_appname(current->comm)) { + sk->oecls_cmd_matched = OECLS_CMD_NO_MATCH; + return; + } + sk->oecls_cmd_matched = OECLS_CMD_MATCHED; + } + if (sk->oecls_cmd_matched != OECLS_CMD_MATCHED) return; + cpu = raw_smp_processor_id(); rcu_read_lock(); tb = rcu_dereference(oecls_sock_flow_table); - hash = READ_ONCE(sk->sk_rxhash); + if (lo_numa_rps && skb && sk_is_loopback(sk)) + hash = READ_ONCE(skb->sym_hash); + else + hash = READ_ONCE(sk->sk_rxhash); if (tb && hash) { index = hash & tb->mask; val = hash & ~oecls_cpu_mask; @@ -183,7 +218,7 @@ static bool oecls_do_hash(void) return get_random_u32() % 100 < rcpu_probability; } -static int get_cpu_in_mask(int tcpu, u32 hash) +static inline int get_cpu_in_mask(int tcpu, u32 hash) 
{ const struct cpumask *mask; int nr_cpus, cpu, index; @@ -268,7 +303,40 @@ static void __oecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, set_oecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu); } -static void _oecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) +static inline void loopback_numa_rps(struct sk_buff *skb, int *rcpu) +{ + struct oecls_sock_flow_table *stb; + u32 last_recv_cpu, hash, val; + int newcpu, index; + + skb_reset_network_header(skb); + hash = __skb_get_hash_symmetric(skb); + if (!hash) + return; + + WRITE_ONCE(skb->sym_hash, hash); + rcu_read_lock(); + stb = rcu_dereference(oecls_sock_flow_table); + if (stb) { + val = READ_ONCE(stb->ents[hash & stb->mask]); + last_recv_cpu = val & oecls_cpu_mask; + } else { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + if (rps_cpus_nums <= 0 || ((val ^ hash) & ~oecls_cpu_mask)) + return; + + newcpu = cpumask_first(cpumask_of_node(cpu_to_node(last_recv_cpu))); + index = rps_cpus[reciprocal_scale(hash, rps_cpus_nums)]; /* scales into [0, rps_cpus_nums) */ + newcpu += index; + *rcpu = newcpu; + oecls_debug("last:%u curcpu:%d newcpu:%d\n", last_recv_cpu, raw_smp_processor_id(), newcpu); +} + +void _oecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) { struct net_device *ndev = skb->dev; struct oecls_sock_flow_table *stb; @@ -281,6 +349,11 @@ static void _oecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) if (!ndev) return; + if (lo_numa_rps && (ndev->type == ARPHRD_LOOPBACK && ndev->flags & IFF_LOOPBACK)) { + loopback_numa_rps(skb, cpu); + return; + } + if (!is_oecls_config_netdev(ndev->name)) return; @@ -424,6 +497,7 @@ static int oecls_sock_flow_table_release(void) mutex_unlock(&oecls_sock_flow_mutex); synchronize_rcu(); vfree(tb); + kfree(rps_cpus); return 0; } @@ -433,10 +507,20 @@ static int oecls_sock_flow_table_init(void) struct oecls_sock_flow_table *table; int size = sft_num, i; + rps_cpus_nums = cpumask_weight(cpumask_of_node(0)); + rps_cpus = kmalloc_array(rps_cpus_nums, sizeof(u16), GFP_KERNEL); + if
(!rps_cpus) + return -ENOMEM; + for (i = 0; i < rps_cpus_nums; i++) + rps_cpus[i] = i; + oecls_debug("rps_cpus_nums:%d\n", rps_cpus_nums); + size = roundup_pow_of_two(size); table = vmalloc(OECLS_SOCK_FLOW_TABLE_SIZE(size)); - if (!table) + if (!table) { + kfree(rps_cpus); return -ENOMEM; + } oecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; oecls_debug("nr_cpu_ids:%d, oecls_cpu_mask:0x%x\n", nr_cpu_ids, oecls_cpu_mask); @@ -455,6 +539,7 @@ static int oecls_sock_flow_table_init(void) static const struct oecls_hook_ops oecls_flow_ops = { .oecls_flow_update = _oecls_flow_update, .oecls_set_cpu = _oecls_set_cpu, + .oecls_set_localcpu = NULL, .oecls_timeout = _oecls_timeout, .oecls_cfg_rxcls = NULL, }; @@ -473,7 +558,8 @@ int oecls_flow_res_init(void) return err; } - RCU_INIT_POINTER(oecls_ops, &oecls_flow_ops); + if (mode != 0) /* mode 0 only needs the flow resources for lo rps */ + RCU_INIT_POINTER(oecls_ops, &oecls_flow_ops); synchronize_rcu(); #ifdef CONFIG_RPS diff --git a/net/oenetcls/oenetcls_main.c b/net/oenetcls/oenetcls_main.c index f9574b344331..e6cffacca161 100644 --- a/net/oenetcls/oenetcls_main.c +++ b/net/oenetcls/oenetcls_main.c @@ -6,6 +6,7 @@ #include <linux/irq.h> #include <linux/irqdesc.h> #include <linux/rtnetlink.h> +#include <linux/oenetcls.h> #include "oenetcls.h" int oecls_netdev_num; @@ -59,6 +60,10 @@ unsigned int sft_num = 0x100000; module_param(sft_num, uint, 0444); MODULE_PARM_DESC(sft_num, "sock flow table entries, default 0x100000"); +int lo_numa_rps; +module_param(lo_numa_rps, int, 0444); /* load-time only: oecls_init/oecls_exit pair on it */ +MODULE_PARM_DESC(lo_numa_rps, "enable loopback flow numa affinity"); + static bool check_params(void) { if (mode != 0 && mode != 1 && mode != 2) @@ -517,7 +522,8 @@ static int init_single_oecls_dev(char *if_name, unsigned int length) ret = oecls_filter_enable(dev_name, &old_state); if (ret) { oecls_error("dev [%s] not support ntuple!
ret=%d\n", dev_name, ret); - goto out; + if (!lo_numa_rps) /* tolerate missing ntuple only for lo rps */ + goto out; } } @@ -1081,14 +1087,22 @@ static __init int oecls_init(void) if (mode == 2 && rcpu_probability < 0) fixup_rcpu_load(); - if (mode == 0) + if (mode == 0) { err = oecls_ntuple_res_init(); - else + if (err) + goto clean_rxq; + if (lo_numa_rps) + err = oecls_flow_res_init(); + } else { err = oecls_flow_res_init(); + } if (err) goto clean_rxq; + if (lo_numa_rps) + static_branch_inc(&oecls_localrps_needed); + return 0; clean_rxq: @@ -1100,10 +1114,16 @@ static __init int oecls_init(void) static __exit void oecls_exit(void) { - if (mode == 0) + if (lo_numa_rps) + static_branch_dec(&oecls_localrps_needed); + + if (mode == 0) { oecls_ntuple_res_clean(); - else + if (lo_numa_rps) + oecls_flow_res_clean(); + } else { oecls_flow_res_clean(); + } #ifdef CONFIG_XPS set_netdev_xps_queue(false); diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c index def33d30f642..c0b97ea7649e 100644 --- a/net/oenetcls/oenetcls_ntuple.c +++ b/net/oenetcls/oenetcls_ntuple.c @@ -582,7 +582,8 @@ static void clean_oecls_sk_rules(void) } static const struct oecls_hook_ops oecls_ntuple_ops = { - .oecls_flow_update = NULL, + .oecls_flow_update = _oecls_flow_update, + .oecls_set_localcpu = _oecls_set_cpu, .oecls_set_cpu = NULL, .oecls_timeout = NULL, .oecls_cfg_rxcls = ethtool_cfg_rxcls, -- 2.34.1