The test currently uses a 10G 82599 network adapter on an Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz platform. The peer end uses DPDK pktgen to send 64-byte packets. The measured performance is as follows:
-----------------------|  CPU  |--| rxpck/s |--| txpck/s |----
without patch (1 dev):
  XDP_DROP:              1.00%      14.69Mpps
  XDP_TX:                2.20%      10.70Mpps    10.70Mpps
  XDP_DROP (RSS):        1.90%      10.13Mpps
  XDP_TX (RSS):          6.20%      10.09Mpps    10.09Mpps
-----------------------
with patch, bond (1 dev):
  XDP_DROP:              1.00%      14.69Mpps
  XDP_TX:                2.10%       6.84Mpps     6.84Mpps
  XDP_DROP (RSS):        2.00%      10.20Mpps
  XDP_TX (RSS):          8.40%      10.17Mpps    10.17Mpps
-----------------------
with patch, bond (2 devs):
  XDP_DROP:              1.90%      29.3Mpps
  XDP_TX:                4.20%      11.31Mpps    11.31Mpps
  XDP_DROP (RSS):        3.90%      20.40Mpps
  XDP_TX (RSS):         16.80%      18.55Mpps    18.55Mpps
--------------------------------------------------------------
Jussi Maki (5):
  net, bonding: Refactor bond_xmit_hash for use with xdp_buff
  net, core: Add support for XDP redirection to slave device
  net, bonding: Add XDP support to the bonding driver
  bonding: Fix negative jump label count on nested bonding
  net: bonding: Use per-cpu rr_tx_counter
 drivers/net/bonding/bond_main.c | 460 ++++++++++++++++++++++++++++----
 include/linux/filter.h          |  13 +-
 include/linux/netdevice.h       |   6 +
 include/net/bonding.h           |   3 +-
 net/core/dev.c                  |  13 +-
 net/core/filter.c               |  25 ++
 6 files changed, 459 insertions(+), 61 deletions(-)
From: Jussi Maki <joamaki@gmail.com>
mainline inclusion
from mainline-v5.15-rc1
commit a815bde56b15ce626caaacc952ab12501671e45d
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7NDRB
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------
In preparation for adding XDP support to the bonding driver, refactor the packet hashing functions so that they can work with any linear data buffer, without an skb.
Signed-off-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Link: https://lore.kernel.org/bpf/20210731055738.16820-2-joamaki@gmail.com
Conflicts:
	drivers/net/bonding/bond_main.c
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
 drivers/net/bonding/bond_main.c | 136 +++++++++++++++++++-------------
 1 file changed, 81 insertions(+), 55 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ec0a4d942c08..ec5e55c53dde 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3601,66 +3601,84 @@ static struct notifier_block bond_netdev_notifier = {
 };
 
 /*---------------------------- Hashing Policies -----------------------------*/
 
+/* Helper to access data in a packet, with or without a backing skb.
+ * If skb is given the data is linearized if necessary via pskb_may_pull.
+ */
+static inline void *bond_pull_data(struct sk_buff *skb,
+				   void *data, int hlen, int n)
+{
+	if (likely(n <= hlen))
+		return data;
+	else if (skb && likely(pskb_may_pull(skb, n)))
+		return skb->head;
+
+	return NULL;
+}
+
 /* L2 hash helper */
-static inline u32 bond_eth_hash(struct sk_buff *skb)
+static inline u32 bond_eth_hash(struct sk_buff *skb, void *data, int mhoff, int hlen)
 {
-	struct ethhdr *ep, hdr_tmp;
+	struct ethhdr *ep;
 
-	ep = skb_header_pointer(skb, 0, sizeof(hdr_tmp), &hdr_tmp);
-	if (ep)
-		return ep->h_dest[5] ^ ep->h_source[5] ^ ep->h_proto;
-	return 0;
+	data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr));
+	if (!data)
+		return 0;
+
+	ep = (struct ethhdr *)(data + mhoff);
+	return ep->h_dest[5] ^ ep->h_source[5] ^ ep->h_proto;
 }
 
-static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk,
-			 int *noff, int *proto, bool l34)
+static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, void *data,
+			 int hlen, __be16 l2_proto, int *nhoff, int *ip_proto, bool l34)
 {
 	const struct ipv6hdr *iph6;
 	const struct iphdr *iph;
 
-	if (skb->protocol == htons(ETH_P_IP)) {
-		if (unlikely(!pskb_may_pull(skb, *noff + sizeof(*iph))))
+	if (l2_proto == htons(ETH_P_IP)) {
+		data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph));
+		if (!data)
 			return false;
-		iph = (const struct iphdr *)(skb->data + *noff);
+
+		iph = (const struct iphdr *)(data + *nhoff);
 		iph_to_flow_copy_v4addrs(fk, iph);
-		*noff += iph->ihl << 2;
+		*nhoff += iph->ihl << 2;
 		if (!ip_is_fragment(iph))
-			*proto = iph->protocol;
-	} else if (skb->protocol == htons(ETH_P_IPV6)) {
-		if (unlikely(!pskb_may_pull(skb, *noff + sizeof(*iph6))))
+			*ip_proto = iph->protocol;
+	} else if (l2_proto == htons(ETH_P_IPV6)) {
+		data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph6));
+		if (!data)
 			return false;
-		iph6 = (const struct ipv6hdr *)(skb->data + *noff);
+
+		iph6 = (const struct ipv6hdr *)(data + *nhoff);
 		iph_to_flow_copy_v6addrs(fk, iph6);
-		*noff += sizeof(*iph6);
-		*proto = iph6->nexthdr;
+		*nhoff += sizeof(*iph6);
+		*ip_proto = iph6->nexthdr;
 	} else {
 		return false;
 	}
 
-	if (l34 && *proto >= 0)
-		fk->ports.ports = skb_flow_get_ports(skb, *noff, *proto);
+	if (l34 && *ip_proto >= 0)
+		fk->ports.ports = __skb_flow_get_ports(skb, *nhoff, *ip_proto, data, hlen);
 
 	return true;
 }
 
 /* Extract the appropriate headers based on bond's xmit policy */
-static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
-			      struct flow_keys *fk)
+static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, void *data,
+			      __be16 l2_proto, int nhoff, int hlen, struct flow_keys *fk)
 {
 	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
-	int noff, proto = -1;
+	int ip_proto = -1;
 
 	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) {
 		memset(fk, 0, sizeof(*fk));
 		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
-					  fk, NULL, 0, 0, 0, 0);
+					  fk, data, l2_proto, nhoff, hlen, 0);
 	}
 
 	fk->ports.ports = 0;
 	memset(&fk->icmp, 0, sizeof(fk->icmp));
-	noff = skb_network_offset(skb);
-	if (!bond_flow_ip(skb, fk, &noff, &proto, l34))
+	if (!bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34))
 		return false;
 
 	/* ICMP error packets contains at least 8 bytes of the header
@@ -3668,51 +3686,41 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	 * to correlate ICMP error packets within the same flow which
 	 * generated the error.
 	 */
-	if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) {
-		skb_flow_get_icmp_tci(skb, &fk->icmp, skb->data,
-				      skb_transport_offset(skb),
-				      skb_headlen(skb));
-		if (proto == IPPROTO_ICMP) {
+	if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) {
+		skb_flow_get_icmp_tci(skb, &fk->icmp, data, nhoff, hlen);
+		if (ip_proto == IPPROTO_ICMP) {
 			if (!icmp_is_err(fk->icmp.type))
 				return true;
 
-			noff += sizeof(struct icmphdr);
-		} else if (proto == IPPROTO_ICMPV6) {
+			nhoff += sizeof(struct icmphdr);
+		} else if (ip_proto == IPPROTO_ICMPV6) {
 			if (!icmpv6_is_err(fk->icmp.type))
 				return true;
 
-			noff += sizeof(struct icmp6hdr);
+			nhoff += sizeof(struct icmp6hdr);
 		}
-		return bond_flow_ip(skb, fk, &noff, &proto, l34);
+		return bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34);
 	}
 
 	return true;
 }
-
-/**
- * bond_xmit_hash - generate a hash value based on the xmit policy
- * @bond: bonding device
- * @skb: buffer to use for headers
- *
- * This function will extract the necessary headers from the skb buffer and use
- * them to generate a hash based on the xmit_policy set in the bonding device
+/* Generate hash based on xmit policy. If @skb is given it is used to linearize
+ * the data as required, but this function can be used without it if the data is
+ * known to be linear (e.g. with xdp_buff).
  */
-u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
+static u32 __bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, void *data,
+			    __be16 l2_proto, int mhoff, int nhoff, int hlen)
 {
 	struct flow_keys flow;
 	u32 hash;
 
-	if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 &&
-	    skb->l4_hash)
-		return skb->hash;
-
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
-	    !bond_flow_dissect(bond, skb, &flow))
-		return bond_eth_hash(skb);
+	    !bond_flow_dissect(bond, skb, data, l2_proto, nhoff, hlen, &flow))
+		return bond_eth_hash(skb, data, mhoff, hlen);
 
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
 	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) {
-		hash = bond_eth_hash(skb);
+		hash = bond_eth_hash(skb, data, mhoff, hlen);
 	} else {
 		if (flow.icmp.id)
 			memcpy(&hash, &flow.icmp, sizeof(hash));
@@ -3727,6 +3735,25 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 	return hash >> 1;
 }
 
+/**
+ * bond_xmit_hash - generate a hash value based on the xmit policy
+ * @bond: bonding device
+ * @skb: buffer to use for headers
+ *
+ * This function will extract the necessary headers from the skb buffer and use
+ * them to generate a hash based on the xmit_policy set in the bonding device
+ */
+u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
+{
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 &&
+	    skb->l4_hash)
+		return skb->hash;
+
+	return __bond_xmit_hash(bond, skb, skb->head, skb->protocol,
+				skb->mac_header, skb->network_header,
+				skb_headlen(skb));
+}
+
 /*-------------------------- Device entry points ----------------------------*/
 
 void bond_work_init_all(struct bonding *bond)
@@ -4373,8 +4400,7 @@ static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
 	return bond_tx_drop(bond_dev, skb);
 }
 
-static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond,
-						      struct sk_buff *skb)
+static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond)
 {
 	return rcu_dereference(bond->curr_active_slave);
 }
@@ -4388,7 +4414,7 @@ static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave;
 
-	slave = bond_xmit_activebackup_slave_get(bond, skb);
+	slave = bond_xmit_activebackup_slave_get(bond);
 	if (slave)
 		return bond_dev_queue_xmit(bond, skb, slave->dev);
 
@@ -4737,7 +4763,7 @@ static struct net_device *bond_xmit_get_slave(struct net_device *master_dev,
 		slave = bond_xmit_roundrobin_slave_get(bond, skb);
 		break;
 	case BOND_MODE_ACTIVEBACKUP:
-		slave = bond_xmit_activebackup_slave_get(bond, skb);
+		slave = bond_xmit_activebackup_slave_get(bond);
 		break;
 	case BOND_MODE_8023AD:
 	case BOND_MODE_XOR:
From: Jussi Maki <joamaki@gmail.com>
mainline inclusion
from mainline-v5.15-rc1
commit 879af96ffd72706c6e3278ea6b45b0b0e37ec5d7
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7NDRB
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------
This adds the ndo_xdp_get_xmit_slave hook for transforming XDP_TX into XDP_REDIRECT after the BPF program has run, when the ingress device is a bond slave.

dev_xdp_prog_count() is exposed so that slave devices can be checked for loaded XDP programs, in order to avoid the situation where both the bond master and a slave have programs loaded according to xdp_state.
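For illustration only (not code from this patch), the simplified receive path below shows why no driver changes are needed: every XDP-capable driver already handles XDP_REDIRECT, so when bpf_prog_run_xdp() rewrites XDP_TX for a bond slave, the driver's existing xdp_do_redirect() call releases the frame from its local RX ring and transmits it on the selected slave. Function and variable names here are illustrative stand-ins for per-driver code.

#include <linux/netdevice.h>
#include <linux/filter.h>	/* bpf_prog_run_xdp(), xdp_do_redirect() */

/* Sketch of the XDP hook that real drivers already implement. */
static u32 drv_run_xdp(struct net_device *dev, struct bpf_prog *prog,
		       struct xdp_buff *xdp)
{
	/* May now return XDP_REDIRECT instead of XDP_TX if dev is a bond
	 * slave and the bond selects a different slave for transmission.
	 */
	u32 act = bpf_prog_run_xdp(prog, xdp);

	switch (act) {
	case XDP_PASS:
	case XDP_DROP:
	case XDP_TX:		/* transmit on this device, driver-local path */
		return act;
	case XDP_REDIRECT:	/* also covers bond XDP_TX to another slave */
		if (xdp_do_redirect(dev, xdp, prog) < 0)
			return XDP_DROP;	/* redirect failed, drop */
		return XDP_REDIRECT;
	default:
		return XDP_DROP;
	}
}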
Signed-off-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Link: https://lore.kernel.org/bpf/20210731055738.16820-3-joamaki@gmail.com
Conflicts:
	include/linux/filter.h
	include/linux/netdevice.h
	net/core/dev.c
	net/core/filter.c
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
 include/linux/filter.h    | 13 ++++++++++++-
 include/linux/netdevice.h |  6 ++++++
 net/core/dev.c            | 13 ++++++++++++-
 net/core/filter.c         | 25 +++++++++++++++++++++++++
 4 files changed, 55 insertions(+), 2 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0418d88aa0f3..a2c9ca9626e9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -747,6 +747,10 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 
 DECLARE_BPF_DISPATCHER(xdp)
 
+DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
+
+u32 xdp_master_redirect(struct xdp_buff *xdp);
+
 static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 					    struct xdp_buff *xdp)
 {
@@ -756,7 +760,14 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 	 * already takes rcu_read_lock() when fetching the program, so
 	 * it's not necessary here anymore.
 	 */
-	return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
+	u32 act = __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
+
+	if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) {
+		if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev))
+			act = xdp_master_redirect(xdp);
+	}
+
+	return act;
 }
 
 void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 47ebb927d4c2..0dc0eb32395f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1301,6 +1301,9 @@ struct netdev_net_notifier {
  *	that got dropped are freed/returned via xdp_return_frame().
  *	Returns negative number, means general error invoking ndo, meaning
  *	no frames were xmit'ed and core-caller will free all frames.
+ * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
+ *					  struct xdp_buff *xdp);
+ *	Get the xmit slave of master device based on the xdp_buff.
  * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
  *	This function is used to wake up the softirq, ksoftirqd or kthread
  *	responsible for sending and/or receiving packets on a specific
@@ -1519,6 +1522,8 @@ struct net_device_ops {
 	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
 						struct xdp_frame **xdp,
 						u32 flags);
+	struct net_device *	(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
+							  struct xdp_buff *xdp);
 	int			(*ndo_xsk_wakeup)(struct net_device *dev,
 						  u32 queue_id, u32 flags);
 	struct devlink_port *	(*ndo_get_devlink_port)(struct net_device *dev);
@@ -3994,6 +3999,7 @@ typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, int expected_fd, u32 flags);
 int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+u8 dev_xdp_prog_count(struct net_device *dev);
 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);
 
 int xdp_umem_query(struct net_device *dev, u16 queue_id);
diff --git a/net/core/dev.c b/net/core/dev.c
index 02bf39500817..5a1994be7d84 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9037,7 +9037,7 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 	return dev->xdp_state[mode].prog;
 }
 
-static u8 dev_xdp_prog_count(struct net_device *dev)
+u8 dev_xdp_prog_count(struct net_device *dev)
 {
 	u8 count = 0;
 	int i;
@@ -9047,6 +9047,7 @@ static u8 dev_xdp_prog_count(struct net_device *dev)
 			count++;
 	return count;
 }
+EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
 
 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 {
@@ -9140,6 +9141,8 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 {
 	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
 	struct bpf_prog *cur_prog;
+	struct net_device *upper;
+	struct list_head *iter;
 	enum bpf_xdp_mode mode;
 	bpf_op_t bpf_op;
 	int err;
@@ -9178,6 +9181,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 		return -EBUSY;
 	}
 
+	/* don't allow if an upper device already has a program */
+	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+		if (dev_xdp_prog_count(upper) > 0) {
+			NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
+			return -EEXIST;
+		}
+	}
+
 	cur_prog = dev_xdp_prog(dev, mode);
 	/* can't replace attached prog with link */
 	if (link && cur_prog) {
diff --git a/net/core/filter.c b/net/core/filter.c
index d42b0d4f7d8a..13fab0b5a969 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3982,6 +3982,31 @@ void bpf_clear_redirect_map(struct bpf_map *map)
 	}
 }
 
+DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
+EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
+
+u32 xdp_master_redirect(struct xdp_buff *xdp)
+{
+	struct net_device *master, *slave;
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
+	slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
+	if (slave && slave != xdp->rxq->dev) {
+		/* The target device is different from the receiving device, so
+		 * redirect it to the new device.
+		 * Using XDP_REDIRECT gets the correct behaviour from XDP enabled
+		 * drivers to unmap the packet from their rx ring.
+		 */
+		ri->tgt_index = slave->ifindex;
+		//ri->map_id = INT_MAX;	TODO: how should this be used here?
+		//ri->map_type = BPF_MAP_TYPE_UNSPEC;
+		return XDP_REDIRECT;
+	}
+	return XDP_TX;
+}
+EXPORT_SYMBOL_GPL(xdp_master_redirect);
+
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 		    struct bpf_prog *xdp_prog)
 {
From: Jussi Maki <joamaki@gmail.com>
mainline inclusion
from mainline-v5.15-rc1
commit 9e2ee5c7e7c35d195e2aa0692a7241d47a433d1e
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7NDRB
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------
XDP is implemented in the bonding driver by transparently delegating the XDP program loading, removal and xmit operations to the bonding slave devices. The overall goal of this work is that XDP programs can be attached to a bond device *without* any further changes (or awareness) necessary to the program itself, meaning the same XDP program can be attached to a native device but also a bonding device.
Semantics of XDP_TX when attached to a bond are equivalent in such a setting to the case when a tc/BPF program would be attached to the bond, meaning transmitting the packet out of the bond itself using one of the bond's configured xmit methods to select a slave device (rather than XDP_TX on the slave itself). Handling of XDP_TX to transmit using the configured bonding mechanism is therefore implemented by rewriting the BPF program return value in bpf_prog_run_xdp. To avoid performance impact, this check is guarded by a static key, which is incremented when an XDP program is loaded onto a bond device. This approach was chosen to avoid changes to drivers implementing XDP. If the slave device does not match the receive device, then XDP_REDIRECT is transparently used to perform the redirection in order to have the network driver release the packet from its RX ring. The bonding driver hashing functions have been refactored to allow reuse with xdp_buffs and avoid code duplication.
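To make the "no program changes" point concrete, below is a minimal XDP_TX reflector of the kind typically used for such tests (a hypothetical sketch, not taken from this series). The same object file can be attached to a plain NIC or, with this series, to a bond master; when the bond picks a slave other than the receiving one, the XDP_TX return value is rewritten to XDP_REDIRECT without the program being aware of it.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical test program (not part of this series): swap the Ethernet
 * source/destination MAC addresses and bounce the frame back with XDP_TX.
 * There is no bond-specific logic in the program.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_tx_reflect(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	unsigned char *eth = data;
	unsigned char tmp[6];

	if (data + 14 > data_end)	/* need a full Ethernet header */
		return XDP_DROP;

	__builtin_memcpy(tmp, eth, 6);		/* save dst MAC   */
	__builtin_memcpy(eth, eth + 6, 6);	/* dst  = old src */
	__builtin_memcpy(eth + 6, tmp, 6);	/* src  = old dst */

	return XDP_TX;	/* rewritten to XDP_REDIRECT by the bond if needed */
}

char _license[] SEC("license") = "GPL";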
The motivation for this change is to enable use of bonding (and 802.3ad) in hairpinning L4 load-balancers such as [1] implemented with XDP, and also to transparently support bond devices for projects that use XDP, given that most modern NICs have dual-port adapters. An alternative to this approach would be to implement 802.3ad in user-space and implement the bonding load-balancing in the XDP program itself, but this is rather a cumbersome endeavor in terms of slave device management (e.g. by watching netlink) and requires separate programs for native vs bond cases for the orchestrator. A native in-kernel implementation overcomes these issues and provides more flexibility.
Below are benchmark results done on two machines with 100Gbit Intel E810 (ice) NICs, with a 32-core 3970X on the sending machine and a 16-core 3950X on the receiving machine. 64-byte packets were sent with pktgen-dpdk at full rate. Two issues [2, 3] were identified with the ice driver, so the tests were performed with iommu=off and patch [2] applied. Additionally, the bonding round-robin algorithm was modified to use per-cpu tx counters, as high CPU load (50% vs 10%) and a high rate of cache misses were caused by the shared rr_tx_counter (see patch 2/3). The statistics were collected using "sar -n dev -u 1 10". On top of that, for ice, further work is in progress on improving the XDP_TX numbers [4].
-----------------------|  CPU  |--| rxpck/s |--| txpck/s |----
without patch (1 dev):
  XDP_DROP:              3.15%      48.6Mpps
  XDP_TX:                3.12%      18.3Mpps     18.3Mpps
  XDP_DROP (RSS):        9.47%     116.5Mpps
  XDP_TX (RSS):          9.67%      25.3Mpps     24.2Mpps
-----------------------
with patch, bond (1 dev):
  XDP_DROP:              3.14%      46.7Mpps
  XDP_TX:                3.15%      13.9Mpps     13.9Mpps
  XDP_DROP (RSS):       10.33%     117.2Mpps
  XDP_TX (RSS):         10.64%      25.1Mpps     24.0Mpps
-----------------------
with patch, bond (2 devs):
  XDP_DROP:              6.27%      92.7Mpps
  XDP_TX:                6.26%      17.6Mpps     17.5Mpps
  XDP_DROP (RSS):       11.38%     117.2Mpps
  XDP_TX (RSS):         14.30%      28.7Mpps     27.4Mpps
--------------------------------------------------------------
RSS: Receive Side Scaling, e.g. the packets were sent to a range of destination IPs.
[1]: https://cilium.io/blog/2021/05/20/cilium-110#standalonelb
[2]: https://lore.kernel.org/bpf/20210601113236.42651-1-maciej.fijalkowski@intel....
[3]: https://lore.kernel.org/bpf/CAHn8xckNXci+X_Eb2WMv4uVYjO2331UWB2JLtXr_58z0Av8...
[4]: https://lore.kernel.org/bpf/20210805230046.28715-1-maciej.fijalkowski@intel....
Signed-off-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Cc: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/bpf/20210731055738.16820-4-joamaki@gmail.com
Conflicts:
	drivers/net/bonding/bond_main.c
	include/net/bonding.h
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
 drivers/net/bonding/bond_main.c | 309 +++++++++++++++++++++++++++++++-
 include/net/bonding.h           |   1 +
 2 files changed, 309 insertions(+), 1 deletion(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ec5e55c53dde..38c33d2a0a0a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -302,6 +302,19 @@ netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb,
 	return dev_queue_xmit(skb);
 }
 
+static bool bond_xdp_check(struct bonding *bond)
+{
+	switch (BOND_MODE(bond)) {
+	case BOND_MODE_ROUNDROBIN:
+	case BOND_MODE_ACTIVEBACKUP:
+	case BOND_MODE_8023AD:
+	case BOND_MODE_XOR:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /*---------------------------------- VLAN -----------------------------------*/
 
 /* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid,
@@ -2118,6 +2131,41 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 		bond_update_slave_arr(bond, NULL);
 
+	if (!slave_dev->netdev_ops->ndo_bpf ||
+	    !slave_dev->netdev_ops->ndo_xdp_xmit) {
+		if (bond->xdp_prog) {
+			NL_SET_ERR_MSG(extack, "Slave does not support XDP");
+			slave_err(bond_dev, slave_dev, "Slave does not support XDP\n");
+			res = -EOPNOTSUPP;
+			goto err_sysfs_del;
+		}
+	} else {
+		struct netdev_bpf xdp = {
+			.command = XDP_SETUP_PROG,
+			.flags = 0,
+			.prog = bond->xdp_prog,
+			.extack = extack,
+		};
+
+		if (dev_xdp_prog_count(slave_dev) > 0) {
+			NL_SET_ERR_MSG(extack,
+				       "Slave has XDP program loaded, please unload before enslaving");
+			slave_err(bond_dev, slave_dev,
+				  "Slave has XDP program loaded, please unload before enslaving\n");
+			res = -EOPNOTSUPP;
+			goto err_sysfs_del;
+		}
+
+		res = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
+		if (res < 0) {
+			/* ndo_bpf() sets extack error message */
+			slave_dbg(bond_dev, slave_dev, "Error %d calling ndo_bpf\n", res);
+			goto err_sysfs_del;
+		}
+		if (bond->xdp_prog)
+			bpf_prog_inc(bond->xdp_prog);
+	}
+
 	slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n",
 		   bond_is_active_slave(new_slave) ? "an active" : "a backup",
 		   new_slave->link != BOND_LINK_DOWN ? "an up" : "a down");
 
@@ -2235,6 +2283,17 @@ static int __bond_release_one(struct net_device *bond_dev,
 	/* recompute stats just before removing the slave */
 	bond_get_stats(bond->dev, &bond->bond_stats);
 
+	if (bond->xdp_prog) {
+		struct netdev_bpf xdp = {
+			.command = XDP_SETUP_PROG,
+			.flags = 0,
+			.prog = NULL,
+			.extack = NULL,
+		};
+		if (slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp))
+			slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n");
+	}
+
 	/* unregister rx_handler early so bond_handle_frame wouldn't be called
 	 * for this slave anymore.
 	 */
@@ -3625,7 +3684,7 @@ static inline u32 bond_eth_hash(struct sk_buff *skb, void *data, int mhoff, int
 		return 0;
 
 	ep = (struct ethhdr *)(data + mhoff);
-	return ep->h_dest[5] ^ ep->h_source[5] ^ ep->h_proto;
+	return ep->h_dest[5] ^ ep->h_source[5] ^ be16_to_cpu(ep->h_proto);
 }
 
 static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, void *data,
@@ -3754,6 +3813,26 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 				skb_headlen(skb));
 }
 
+/**
+ * bond_xmit_hash_xdp - generate a hash value based on the xmit policy
+ * @bond: bonding device
+ * @xdp: buffer to use for headers
+ *
+ * The XDP variant of bond_xmit_hash.
+ */
+static u32 bond_xmit_hash_xdp(struct bonding *bond, struct xdp_buff *xdp)
+{
+	struct ethhdr *eth;
+
+	if (xdp->data + sizeof(struct ethhdr) > xdp->data_end)
+		return 0;
+
+	eth = (struct ethhdr *)xdp->data;
+
+	return __bond_xmit_hash(bond, NULL, xdp->data, eth->h_proto, 0,
+				sizeof(struct ethhdr), xdp->data_end - xdp->data);
+}
+
 /*-------------------------- Device entry points ----------------------------*/
 
 void bond_work_init_all(struct bonding *bond)
@@ -4387,6 +4466,47 @@ static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond,
 	return NULL;
 }
 
+static struct slave *bond_xdp_xmit_roundrobin_slave_get(struct bonding *bond,
+							struct xdp_buff *xdp)
+{
+	struct slave *slave;
+	int slave_cnt;
+	u32 slave_id;
+	const struct ethhdr *eth;
+	void *data = xdp->data;
+
+	if (data + sizeof(struct ethhdr) > xdp->data_end)
+		goto non_igmp;
+
+	eth = (struct ethhdr *)data;
+	data += sizeof(struct ethhdr);
+
+	/* See comment on IGMP in bond_xmit_roundrobin_slave_get() */
+	if (eth->h_proto == htons(ETH_P_IP)) {
+		const struct iphdr *iph;
+
+		if (data + sizeof(struct iphdr) > xdp->data_end)
+			goto non_igmp;
+
+		iph = (struct iphdr *)data;
+
+		if (iph->protocol == IPPROTO_IGMP) {
+			slave = rcu_dereference(bond->curr_active_slave);
+			if (slave)
+				return slave;
+			return bond_get_slave_by_id(bond, 0);
+		}
+	}
+
+non_igmp:
+	slave_cnt = READ_ONCE(bond->slave_cnt);
+	if (likely(slave_cnt)) {
+		slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
+		return bond_get_slave_by_id(bond, slave_id);
+	}
+	return NULL;
+}
+
 static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
 					struct net_device *bond_dev)
 {
@@ -4635,6 +4755,22 @@ static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond,
 	return slave;
 }
 
+static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond,
+						     struct xdp_buff *xdp)
+{
+	struct bond_up_slave *slaves;
+	unsigned int count;
+	u32 hash;
+
+	hash = bond_xmit_hash_xdp(bond, xdp);
+	slaves = rcu_dereference(bond->usable_slaves);
+	count = slaves ? READ_ONCE(slaves->count) : 0;
+	if (unlikely(!count))
+		return NULL;
+
+	return slaves->arr[hash % count];
+}
+
 /* Use this Xmit function for 3AD as well as XOR modes. The current
  * usable slave array is formed in the control path. The xmit function
  * just calculates hash and sends the packet out.
@@ -4843,6 +4979,174 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	return ret;
 }
 
+static struct net_device *
+bond_xdp_get_xmit_slave(struct net_device *bond_dev, struct xdp_buff *xdp)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave;
+
+	/* Caller needs to hold rcu_read_lock() */
+
+	switch (BOND_MODE(bond)) {
+	case BOND_MODE_ROUNDROBIN:
+		slave = bond_xdp_xmit_roundrobin_slave_get(bond, xdp);
+		break;
+
+	case BOND_MODE_ACTIVEBACKUP:
+		slave = bond_xmit_activebackup_slave_get(bond);
+		break;
+
+	case BOND_MODE_8023AD:
+	case BOND_MODE_XOR:
+		slave = bond_xdp_xmit_3ad_xor_slave_get(bond, xdp);
+		break;
+
+	default:
+		/* Should never happen. Mode guarded by bond_xdp_check() */
+		netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n", BOND_MODE(bond));
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+
+	if (slave)
+		return slave->dev;
+
+	return NULL;
+}
+
+static int bond_xdp_xmit(struct net_device *bond_dev,
+			 int n, struct xdp_frame **frames, u32 flags)
+{
+	int nxmit, err = -ENXIO;
+
+	rcu_read_lock();
+
+	for (nxmit = 0; nxmit < n; nxmit++) {
+		struct xdp_frame *frame = frames[nxmit];
+		struct xdp_frame *frames1[] = {frame};
+		struct net_device *slave_dev;
+		struct xdp_buff xdp;
+
+		xdp_convert_frame_to_buff(frame, &xdp);
+
+		slave_dev = bond_xdp_get_xmit_slave(bond_dev, &xdp);
+		if (!slave_dev) {
+			err = -ENXIO;
+			break;
+		}
+
+		err = slave_dev->netdev_ops->ndo_xdp_xmit(slave_dev, 1, frames1, flags);
+		if (err < 1)
+			break;
+	}
+
+	rcu_read_unlock();
+
+	/* If error happened on the first frame then we can pass the error up, otherwise
+	 * report the number of frames that were xmitted.
+	 */
+	if (err < 0)
+		return (nxmit == 0 ? err : nxmit);
+
+	return nxmit;
+}
+
+static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+			struct netlink_ext_ack *extack)
+{
+	struct bonding *bond = netdev_priv(dev);
+	struct list_head *iter;
+	struct slave *slave, *rollback_slave;
+	struct bpf_prog *old_prog;
+	struct netdev_bpf xdp = {
+		.command = XDP_SETUP_PROG,
+		.flags = 0,
+		.prog = prog,
+		.extack = extack,
+	};
+	int err;
+
+	ASSERT_RTNL();
+
+	if (!bond_xdp_check(bond))
+		return -EOPNOTSUPP;
+
+	old_prog = bond->xdp_prog;
+	bond->xdp_prog = prog;
+
+	bond_for_each_slave(bond, slave, iter) {
+		struct net_device *slave_dev = slave->dev;
+
+		if (!slave_dev->netdev_ops->ndo_bpf ||
+		    !slave_dev->netdev_ops->ndo_xdp_xmit) {
+			NL_SET_ERR_MSG(extack, "Slave device does not support XDP");
+			slave_err(dev, slave_dev, "Slave does not support XDP\n");
+			err = -EOPNOTSUPP;
+			goto err;
+		}
+
+		if (dev_xdp_prog_count(slave_dev) > 0) {
+			NL_SET_ERR_MSG(extack,
+				       "Slave has XDP program loaded, please unload before enslaving");
+			slave_err(dev, slave_dev,
+				  "Slave has XDP program loaded, please unload before enslaving\n");
+			err = -EOPNOTSUPP;
+			goto err;
+		}
+
+		err = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
+		if (err < 0) {
+			/* ndo_bpf() sets extack error message */
+			slave_err(dev, slave_dev, "Error %d calling ndo_bpf\n", err);
+			goto err;
+		}
+		if (prog)
+			bpf_prog_inc(prog);
+	}
+
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	if (prog)
+		static_branch_inc(&bpf_master_redirect_enabled_key);
+	else
+		static_branch_dec(&bpf_master_redirect_enabled_key);
+
+	return 0;
+
+err:
+	/* unwind the program changes */
+	bond->xdp_prog = old_prog;
+	xdp.prog = old_prog;
+	xdp.extack = NULL; /* do not overwrite original error */
+
+	bond_for_each_slave(bond, rollback_slave, iter) {
+		struct net_device *slave_dev = rollback_slave->dev;
+		int err_unwind;
+
+		if (slave == rollback_slave)
+			break;
+
+		err_unwind = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
+		if (err_unwind < 0)
+			slave_err(dev, slave_dev,
+				  "Error %d when unwinding XDP program change\n", err_unwind);
+		else if (xdp.prog)
+			bpf_prog_inc(xdp.prog);
+	}
+	return err;
+}
+
+static int bond_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return bond_xdp_set(dev, xdp->prog, xdp->extack);
+	default:
+		return -EINVAL;
+	}
+}
+
 static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed)
 {
 	if (speed == 0 || speed == SPEED_UNKNOWN)
@@ -4928,6 +5232,9 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_fix_features	= bond_fix_features,
 	.ndo_features_check	= passthru_features_check,
 	.ndo_get_xmit_slave	= bond_xmit_get_slave,
+	.ndo_bpf		= bond_xdp,
+	.ndo_xdp_xmit		= bond_xdp_xmit,
+	.ndo_xdp_get_xmit_slave	= bond_xdp_get_xmit_slave,
 };
 
 static const struct device_type bond_type = {
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 467c9dac43ca..bb238e322e75 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -258,6 +258,7 @@ struct bonding {
 	/* protecting ipsec_list */
 	spinlock_t ipsec_lock;
 #endif /* CONFIG_XFRM_OFFLOAD */
+	struct bpf_prog *xdp_prog;
 };
 
 #define bond_slave_get_rcu(dev) \
From: Jussi Maki <joamaki@gmail.com>
mainline inclusion
from mainline-v5.15-rc1
commit 6d5f1ef838683efba01bacb7854f6516fbcbae17
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7NDRB
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------
With nested bonding devices, the nested bond device's ndo_bpf was called without a program, causing it to decrement the static key without a prior increment and leading to a negative count.

Fix the issue by 1) only calling the slave's ndo_bpf when there is a program to be loaded, and 2) only decrementing the count when a program is unloaded.
Fixes: 9e2ee5c7e7c3 ("net, bonding: Add XDP support to the bonding driver")
Reported-by: syzbot+30622fb04ddd72a4d167@syzkaller.appspotmail.com
Signed-off-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
 drivers/net/bonding/bond_main.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 38c33d2a0a0a..25677c8db1cb 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2139,7 +2139,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 			res = -EOPNOTSUPP;
 			goto err_sysfs_del;
 		}
-	} else {
+	} else if (bond->xdp_prog) {
 		struct netdev_bpf xdp = {
 			.command = XDP_SETUP_PROG,
 			.flags = 0,
@@ -5104,13 +5104,12 @@ static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 			bpf_prog_inc(prog);
 	}
 
-	if (old_prog)
-		bpf_prog_put(old_prog);
-
-	if (prog)
+	if (prog) {
 		static_branch_inc(&bpf_master_redirect_enabled_key);
-	else
+	} else if (old_prog) {
+		bpf_prog_put(old_prog);
 		static_branch_dec(&bpf_master_redirect_enabled_key);
+	}
 
 	return 0;
From: Jussi Maki <joamaki@gmail.com>
mainline inclusion
from mainline-v5.14-rc1
commit 848ca9182a7d25bb54955c3aab9a3a2742bf9678
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7NDRB
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------
The round-robin rr_tx_counter was shared across CPUs, leading to significant cache thrashing at high packet rates. This patch switches the round-robin packet counter to a per-cpu variable to decide the destination slave.

On a test with a 2x100Gbit ICE NIC and pktgen_sample_04_many_flows.sh (-s 64 -t 32), the tx rate was 19.6Mpps before and 22.3Mpps after this patch.
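As a minimal sketch of the per-cpu counter pattern this patch switches to (names here are illustrative, not the bonding driver's; the real fields and call sites are in the diff below): each CPU increments its own copy of the counter, so concurrent transmitters no longer bounce a shared cache line.

/* Sketch only - illustrative names, not code from this patch. */
#include <linux/percpu.h>
#include <linux/errno.h>

struct rr_state {
	u32 __percpu *tx_counter;
};

static int rr_init(struct rr_state *s)
{
	s->tx_counter = alloc_percpu(u32);	/* one u32 per possible CPU */
	return s->tx_counter ? 0 : -ENOMEM;
}

static u32 rr_next_id(struct rr_state *s, unsigned int slave_cnt)
{
	/* this_cpu_inc_return() only serializes against this CPU; slight
	 * reordering between CPUs is acceptable for round-robin spreading.
	 */
	return this_cpu_inc_return(*s->tx_counter) % slave_cnt;
}

static void rr_fini(struct rr_state *s)
{
	free_percpu(s->tx_counter);
}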
"perf top -e cache_misses" before: 12.31% [bonding] [k] bond_xmit_roundrobin_slave_get 10.59% [sch_fq_codel] [k] fq_codel_dequeue 9.34% [kernel] [k] skb_release_data after: 15.42% [sch_fq_codel] [k] fq_codel_dequeue 10.06% [kernel] [k] __memset 9.12% [kernel] [k] skb_release_data
Signed-off-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Conflicts:
	drivers/net/bonding/bond_main.c
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
 drivers/net/bonding/bond_main.c | 18 +++++++++++++++---
 include/net/bonding.h           |  2 +-
 2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 25677c8db1cb..63e623cfef80 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4414,16 +4414,16 @@ static u32 bond_rr_gen_slave_id(struct bonding *bond)
 		slave_id = prandom_u32();
 		break;
 	case 1:
-		slave_id = bond->rr_tx_counter;
+		slave_id = this_cpu_inc_return(*bond->rr_tx_counter);
 		break;
 	default:
 		reciprocal_packets_per_slave =
 			bond->params.reciprocal_packets_per_slave;
-		slave_id = reciprocal_divide(bond->rr_tx_counter,
+		slave_id = this_cpu_inc_return(*bond->rr_tx_counter);
+		slave_id = reciprocal_divide(slave_id,
 					     reciprocal_packets_per_slave);
 		break;
 	}
-	bond->rr_tx_counter++;
 
 	return slave_id;
 }
@@ -5245,6 +5245,9 @@ static void bond_destructor(struct net_device *bond_dev)
 	struct bonding *bond = netdev_priv(bond_dev);
 
 	if (bond->wq)
 		destroy_workqueue(bond->wq);
+
+	if (bond->rr_tx_counter)
+		free_percpu(bond->rr_tx_counter);
 }
 
 void bond_setup(struct net_device *bond_dev)
@@ -5744,6 +5747,15 @@ static int bond_init(struct net_device *bond_dev)
 	if (!bond->wq)
 		return -ENOMEM;
 
+	if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN) {
+		bond->rr_tx_counter = alloc_percpu(u32);
+		if (!bond->rr_tx_counter) {
+			destroy_workqueue(bond->wq);
+			bond->wq = NULL;
+			return -ENOMEM;
+		}
+	}
+
 	spin_lock_init(&bond->stats_lock);
 	netdev_lockdep_set_classes(bond_dev);
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index bb238e322e75..7b07700477fc 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -237,7 +237,7 @@ struct bonding {
 	char proc_file_name[IFNAMSIZ];
 #endif /* CONFIG_PROC_FS */
 	struct list_head bond_list;
-	u32 rr_tx_counter;
+	u32 __percpu *rr_tx_counter;
 	struct ad_bond_info ad_info;
 	struct alb_bond_info alb_info;
 	struct bond_params params;
FeedBack: The patch(es) you sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/1516
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/Y...