Yue Haibing (1): xsk: Add generic xdp multi-buffer recv support
bitcoffee (1): ipvlan: support use xdp native mode
drivers/net/ipvlan/ipvlan.h | 1 + drivers/net/ipvlan/ipvlan_core.c | 16 +++ drivers/net/ipvlan/ipvlan_main.c | 22 ++++ include/linux/netdevice.h | 7 ++ include/linux/skbuff.h | 6 + include/net/xdp.h | 42 +++++++ include/net/xdp_sock.h | 20 +++ include/net/xdp_sock_drv.h | 14 +++ include/net/xsk_buff_pool.h | 7 ++ include/uapi/linux/if_xdp.h | 13 ++ net/core/dev.c | 210 +++++++++++++++++++++++++++++++ net/core/filter.c | 4 + net/core/skbuff.c | 90 +++++++++++++ net/xdp/Kconfig | 8 ++ net/xdp/xsk.c | 168 +++++++++++++++++++++++++ net/xdp/xsk_queue.h | 62 +++++++++ 16 files changed, 690 insertions(+)
Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list have been successfully converted to a pull request! Pull request link: https://gitee.com/openeuler/kernel/pulls/12027 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/Z...
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAOZOH
----------------------------------------------
This is a partial backport from mainline of the following commits:

e6d5dbdd20aa xdp: add multi-buff support for xdp running in generic mode
c6a28acb1a27 net: fix pointer check in skb_pp_cow_data routine
4d2bb0bfe874 xdp: rely on skb pointer reference in do_xdp_generic and netif_receive_generic_xdp
fe21cb91ae7b net: core: Split out code to run generic XDP prog
cf24f5a5feea xsk: add support for AF_XDP multi-buffer on Tx path
2127c6043836 xsk: Add truesize to skb_add_rx_frag().
9d0a67b9d42c xsk: Fix xsk_build_skb() error: 'skb' dereferencing possible ERR_PTR()
1b725b0c8163 xsk: allow core/drivers to test EOP bit
b7f72a30e9ac xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path
804627751b42 xsk: add support for AF_XDP multi-buffer on Rx path
faa91b839b09 xsk: move xdp_buff's data length check to xsk_rcv_check
556444c4e683 xsk: prepare both copy and zero-copy modes to co-exist
81470b5c3c66 xsk: introduce XSK_USE_SG bind flag for xsk socket
63a64a56bc3f xsk: prepare 'options' in xdp_desc for multi-buffer use
458f72723412 xsk: Remove explicit_free parameter from __xsk_rcv()
a6e944f25cdb xsk: Fix generic transmit when completion queue reservation fails
9c8f21e6f885 xsk: Build skb by page (aka generic zerocopy xmit)
c2ff53d8049f net: Add priv_flags for allow tx skb without linear
0165cc817075 bpf: introduce bpf_xdp_get_buff_len helper
2e88d4ff0301 xdp: introduce flags field in xdp_buff/xdp_frame
d16697cb6261 net: skbuff: add size metadata to skb_shared_info for xdp
9349eb3a9d2a xsk: Introduce batched Tx descriptor interfaces
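For reference, a minimal userspace sketch of how the backported multi-buffer support is consumed: the socket opts in with the new XDP_USE_SG bind flag, then walks Rx descriptors until a descriptor without XDP_PKT_CONTD marks end-of-packet. Only struct sockaddr_xdp, struct xdp_desc, XDP_USE_SG and XDP_PKT_CONTD come from the uapi below; recv_rx_desc(), umem_area and the handle_*() callbacks are hypothetical application helpers.

	struct sockaddr_xdp sxdp = {
		.sxdp_family   = AF_XDP,
		.sxdp_ifindex  = ifindex,
		.sxdp_queue_id = queue_id,
		/* opt in to multi-buffer (scatter-gather) descriptors */
		.sxdp_flags    = XDP_USE_NEED_WAKEUP | XDP_USE_SG,
	};

	bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));

	struct xdp_desc desc;

	while (recv_rx_desc(xsk_fd, &desc)) {
		/* desc.addr is an offset into the mmap'ed UMEM area */
		handle_frag(umem_area + desc.addr, desc.len);

		/* XDP_PKT_CONTD cleared => last descriptor of this packet */
		if (!(desc.options & XDP_PKT_CONTD))
			handle_packet_end();
	}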
Signed-off-by: Yue Haibing yuehaibing@huawei.com --- include/linux/netdevice.h | 7 ++ include/linux/skbuff.h | 6 ++ include/net/xdp.h | 42 ++++++++ include/net/xdp_sock.h | 20 ++++ include/net/xdp_sock_drv.h | 14 +++ include/net/xsk_buff_pool.h | 7 ++ include/uapi/linux/if_xdp.h | 13 +++ net/core/dev.c | 210 ++++++++++++++++++++++++++++++++++++ net/core/filter.c | 4 + net/core/skbuff.c | 90 ++++++++++++++++ net/xdp/Kconfig | 8 ++ net/xdp/xsk.c | 168 +++++++++++++++++++++++++++++ net/xdp/xsk_queue.h | 62 +++++++++++ 13 files changed, 651 insertions(+)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c765abd56c62..87fc83328987 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1620,6 +1620,8 @@ struct net_device_ops { * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running + * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with + * skb_headlen(skb) == 0 (data starts from frag0) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1653,6 +1655,7 @@ enum netdev_priv_flags { IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, + IFF_TX_SKB_NO_LINEAR = 1<<31, };
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1685,6 +1688,7 @@ enum netdev_priv_flags { #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK +#define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR
/* Specifies the type of the struct net_device::ml_priv pointer */ enum netdev_ml_priv_type { @@ -3953,6 +3957,9 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); +#ifdef CONFIG_XSK_MULTI_BUF +int do_xdp_generic_multi(struct bpf_prog *xdp_prog, struct sk_buff **pskb); +#endif int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_rx_any_context(struct sk_buff *skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 95f0a69225fc..4a7a2ff7aec3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -530,6 +530,9 @@ struct skb_shared_info { * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; +#ifdef CONFIG_XSK_MULTI_BUF + KABI_FILL_HOLE(unsigned int xdp_frags_size) +#endif
/* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ @@ -3110,6 +3113,9 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); }
+#ifdef CONFIG_XSK_MULTI_BUF +int skb_cow_data_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog); +#endif /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment diff --git a/include/net/xdp.h b/include/net/xdp.h index a0d723fb032d..1461be3d8ef5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -71,6 +71,12 @@ struct xdp_txq_info { struct net_device *dev; };
+#ifdef CONFIG_XSK_MULTI_BUF +enum xdp_buff_flags { + XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ +}; +#endif + struct xdp_buff { void *data; void *data_end; @@ -79,8 +85,28 @@ struct xdp_buff { struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ +#ifdef CONFIG_XSK_MULTI_BUF + KABI_FILL_HOLE(u32 flags) /* supported values defined in xdp_buff_flags */ +#endif };
+#ifdef CONFIG_XSK_MULTI_BUF +static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); +} + +static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) +{ + xdp->flags |= XDP_FLAGS_HAS_FRAGS; +} + +static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; +} +#endif + /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the @@ -97,6 +123,22 @@ xdp_get_shared_info_from_buff(struct xdp_buff *xdp) return (struct skb_shared_info *)xdp_data_hard_end(xdp); }
+#ifdef CONFIG_XSK_MULTI_BUF +static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) +{ + unsigned int len = xdp->data_end - xdp->data; + struct skb_shared_info *sinfo; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + len += sinfo->xdp_frags_size; +out: + return len; +} +#endif + struct xdp_frame { void *data; u16 len; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index cc17bc957548..aab493e68d71 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -43,6 +43,13 @@ struct xsk_map { struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; +#ifdef CONFIG_XSK_MULTI_BUF + /* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current + * packet, the partially built skb is saved here so that packet building can resume in next + * call of __xsk_generic_xmit(). + */ + KABI_FILL_HOLE(struct sk_buff *skb) +#endif struct xsk_queue *rx ____cacheline_aligned_in_smp; struct net_device *dev; struct xdp_umem *umem; @@ -50,6 +57,9 @@ struct xdp_sock { struct xsk_buff_pool *pool; u16 queue_id; bool zc; +#ifdef CONFIG_XSK_MULTI_BUF + KABI_FILL_HOLE(bool sg) +#endif enum { XSK_READY = 0, XSK_BOUND, @@ -77,6 +87,9 @@ struct xdp_sock { #ifdef CONFIG_XDP_SOCKETS
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +#ifdef CONFIG_XSK_MULTI_BUF +int xsk_generic_rcv_multi(struct xdp_sock *xs, struct xdp_buff *xdp); +#endif int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void);
@@ -100,6 +113,13 @@ static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return -ENOTSUPP; }
+#ifdef CONFIG_XSK_MULTI_BUF +static inline int xsk_generic_rcv_multi(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} +#endif + static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 5b1ee8a9976d..4bad49c950c0 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -76,6 +76,13 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); }
+#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +{ + return !xp_mb_desc(desc); +} +#endif + static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return xp_can_alloc(pool, count); @@ -205,6 +212,13 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; }
+#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +{ + return false; +} +#endif + static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return false; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 40fa8e1d485a..d9df0e8e8e84 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -157,6 +157,13 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); }
+#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xp_mb_desc(struct xdp_desc *desc) +{ + return desc->options & XDP_PKT_CONTD; +} +#endif + static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) { return addr & pool->chunk_mask; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index a78a8096f4ce..8d48863472b9 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -25,6 +25,12 @@ * application. */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* By setting this option, userspace application indicates that it can + * handle multiple descriptors per packet thus enabling AF_XDP to split + * multi-buffer XDP frames into multiple Rx descriptors. Without this set + * such frames will be dropped. + */ +#define XDP_USE_SG (1 << 4)
/* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) @@ -108,4 +114,11 @@ struct xdp_desc {
/* UMEM descriptor is __u64 */
+/* Flag indicating that the packet continues with the buffer pointed out by the + * next frame in the ring. The end of the packet is signalled by setting this + * bit to zero. For single buffer packets, every descriptor has 'options' set + * to 0 and this maintains backward compatibility. + */ +#define XDP_PKT_CONTD (1 << 0) + #endif /* _LINUX_IF_XDP_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 8e0f4690e157..3807c708d357 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4643,6 +4643,216 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) return rxqueue; }
+#ifdef CONFIG_XSK_MULTI_BUF +static u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct netdev_rx_queue *rxqueue; + void *orig_data, *orig_data_end; + bool orig_bcast, orig_host; + __be16 orig_eth_type; + struct ethhdr *eth; + u32 metalen, act; + int hlen, off; + u32 mac_len; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp->data = skb->data - mac_len; + xdp->data_meta = xdp->data; + xdp->data_end = xdp->data + hlen; + xdp->data_hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } + + orig_data_end = xdp->data_end; + orig_data = xdp->data; + eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); + orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); + orig_eth_type = eth->h_proto; + + rxqueue = netif_get_rxqueue(skb); + xdp->rxq = &rxqueue->xdp_rxq; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + + /* check if bpf_xdp_adjust_head was used */ + off = xdp->data - orig_data; + if (off) { + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + skb->mac_header += off; + skb_reset_network_header(skb); + } + + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; + if (off != 0) { + skb_set_tail_pointer(skb, xdp->data_end - xdp->data); + skb->len += off; /* positive on grow, negative on shrink */ + } + + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. + */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + + /* check if XDP changed eth hdr such SKB needs update */ + eth = (struct ethhdr *)xdp->data; + if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || + (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { + __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + } + + /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull + * before calling us again on redirect path. We do not call do_redirect + * as we leave that up to the caller. + * + * Caller is responsible for managing lifetime of skb (i.e. calling + * kfree_skb in response to actions it cannot handle/XDP_DROP). + */ + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + __skb_push(skb, mac_len); + break; + case XDP_PASS: + metalen = xdp->data - xdp->data_meta; + if (metalen) + skb_metadata_set(skb, metalen); + break; + } + + return act; +} + +static int +netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +{ + struct sk_buff *skb = *pskb; + int err, hroom, troom; + + if (!skb_cow_data_for_xdp(pskb, prog)) + return 0; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + troom = skb->tail + skb->data_len - skb->end; + err = pskb_expand_head(skb, + hroom > 0 ? 
ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC); + if (err) + return err; + + return skb_linearize(skb); +} + +static u32 netif_receive_generic_xdp_multi(struct sk_buff **pskb, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct sk_buff *skb = *pskb; + u32 mac_len, act = XDP_DROP; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_is_redirected(skb)) + return XDP_PASS; + + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM + * bytes. This is the guarantee that also native XDP provides, + * thus we need to do it here as well. + */ + mac_len = skb->data - skb_mac_header(skb); + __skb_push(skb, mac_len); + + if (skb_cloned(skb) || skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + if (netif_skb_check_for_xdp(pskb, xdp_prog)) + goto do_drop; + } + + __skb_pull(*pskb, mac_len); + + act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + case XDP_PASS: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception((*pskb)->dev, xdp_prog, act); + fallthrough; + case XDP_DROP: + do_drop: + kfree_skb(*pskb); + break; + } + + return act; +} + +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); +int do_xdp_generic_multi(struct bpf_prog *xdp_prog, struct sk_buff **pskb) +{ + if (xdp_prog) { + struct xdp_buff xdp; + u32 act; + int err; + + act = netif_receive_generic_xdp_multi(pskb, &xdp, xdp_prog); + if (act != XDP_PASS) { + switch (act) { + case XDP_REDIRECT: + err = xdp_do_generic_redirect((*pskb)->dev, *pskb, + &xdp, xdp_prog); + if (err) + goto out_redir; + break; + case XDP_TX: + generic_xdp_tx(*pskb, xdp_prog); + break; + } + return XDP_DROP; + } + } + return XDP_PASS; +out_redir: + kfree_skb(*pskb); + return XDP_DROP; +} +EXPORT_SYMBOL_GPL(do_xdp_generic_multi); +#endif + static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) diff --git a/net/core/filter.c b/net/core/filter.c index a4e94a3e8c76..a4d96f0231b2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4102,7 +4102,11 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { struct xdp_sock *xs = fwd;
+#ifdef CONFIG_XSK_MULTI_BUF + err = xs->sg ? xsk_generic_rcv_multi(xs, xdp) : xsk_generic_rcv(xs, xdp); +#else err = xsk_generic_rcv(xs, xdp); +#endif if (err) goto err; consume_skb(skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index de0229b8a920..9b7dfa0314fa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4713,6 +4713,96 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) } EXPORT_SYMBOL_GPL(skb_cow_data);
+#ifdef CONFIG_XSK_MULTI_BUF +#define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +static int skb_pp_cow_data(struct sk_buff **pskb, unsigned int headroom) +{ + u32 size, truesize, len, max_head_size, off; + struct sk_buff *skb = *pskb, *nskb; + int err, i, head_off; + struct page *page; + void *data; + + /* XDP does not support fraglist so we need to linearize + * the skb. + */ + if (skb_has_frag_list(skb)) + return -EOPNOTSUPP; + + max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); + if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) + return -ENOMEM; + + size = min_t(u32, skb->len, max_head_size); + truesize = SKB_HEAD_ALIGN(size) + headroom; + page = alloc_page(GFP_ATOMIC); + data = page ? page_address(page) : NULL; + if (!data) + return -ENOMEM; + + nskb = build_skb(data, truesize); + if (!nskb) { + __free_page(page); + return -ENOMEM; + } + + skb_reserve(nskb, headroom); + skb_copy_header(nskb, skb); + + err = skb_copy_bits(skb, 0, nskb->data, size); + if (err) { + consume_skb(nskb); + return err; + } + skb_put(nskb, size); + + head_off = skb_headroom(nskb) - skb_headroom(skb); + skb_headers_offset_update(nskb, head_off); + + off = size; + len = skb->len - off; + for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { + struct page *page; + u32 page_off = 0; + + size = min_t(u32, len, PAGE_SIZE); + truesize = size; + + page = alloc_page(GFP_ATOMIC); + if (!page) { + consume_skb(nskb); + return -ENOMEM; + } + + skb_add_rx_frag(nskb, i, page, page_off, size, truesize); + err = skb_copy_bits(skb, off, page_address(page) + page_off, + size); + if (err) { + consume_skb(nskb); + return err; + } + + len -= size; + off += size; + } + + consume_skb(skb); + *pskb = nskb; + + return 0; +} + +int skb_cow_data_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +{ + //if (!prog->aux->xdp_has_frags) + // return -EINVAL; + + return skb_pp_cow_data(pskb, XDP_PACKET_HEADROOM); +} +EXPORT_SYMBOL(skb_cow_data_for_xdp); +#endif + static void sock_rmem_free(struct sk_buff *skb) { struct sock *sk = skb->sk; diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig index 71af2febe72a..7c6306f67f68 100644 --- a/net/xdp/Kconfig +++ b/net/xdp/Kconfig @@ -14,3 +14,11 @@ config XDP_SOCKETS_DIAG help Support for PF_XDP sockets monitoring interface used by the ss tool. If unsure, say Y. + +config XSK_MULTI_BUF + bool "Support generic xdp xsk multi-buffer" + depends on XDP_SOCKETS + default n + help + Support for PF_XDP sockets multi-buffer. + If unsure, say Y. diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 562d69f17b4c..7840b802c2fc 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -211,6 +211,129 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, return 0; }
+#ifdef CONFIG_XSK_MULTI_BUF +static int __xsk_rcv_zc_multi(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, + u32 flags) +{ + u64 addr; + int err; + + addr = xp_get_handle(xskb); + err = xskq_prod_reserve_desc_op(xs->rx, addr, len, flags); + if (err) { + xs->rx_queue_full++; + return err; + } + + xp_release(xskb); + return 0; +} + +static void *xsk_copy_xdp_start(struct xdp_buff *from) +{ + if (unlikely(xdp_data_meta_unsupported(from))) + return from->data; + else + return from->data_meta; +} + +static u32 xsk_copy_xdp_multi(void *to, void **from, u32 to_len, + u32 *from_len, skb_frag_t **frag, u32 rem) +{ + u32 copied = 0; + + while (1) { + u32 copy_len = min_t(u32, *from_len, to_len); + + memcpy(to, *from, copy_len); + copied += copy_len; + if (rem == copied) + return copied; + + if (*from_len == copy_len) { + *from = skb_frag_address(*frag); + *from_len = skb_frag_size((*frag)++); + } else { + *from += copy_len; + *from_len -= copy_len; + } + if (to_len == copy_len) + return copied; + + to_len -= copy_len; + to += copy_len; + } +} + +static int __xsk_rcv_multi(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; + u32 from_len, meta_len, rem, num_desc; + struct xdp_buff_xsk *xskb; + struct xdp_buff *xsk_xdp; + skb_frag_t *frag; + + from_len = xdp->data_end - copy_from; + meta_len = xdp->data - copy_from; + rem = len + meta_len; + + if (len <= frame_size && !xdp_buff_has_frags(xdp)) { + int err; + + xsk_xdp = xsk_buff_alloc(xs->pool); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOMEM; + } + memcpy(xsk_xdp->data - meta_len, copy_from, rem); + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + err = __xsk_rcv_zc_multi(xs, xskb, len, 0); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + + return 0; + } + + num_desc = (len - 1) / frame_size + 1; + + if (!xsk_buff_can_alloc(xs->pool, num_desc)) { + xs->rx_dropped++; + return -ENOSPC; + } + + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + return -ENOBUFS; + } + if (xdp_buff_has_frags(xdp)) { + struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(xdp); + frag = &sinfo->frags[0]; + } + + do { + u32 to_len = frame_size + meta_len; + u32 copied; + + xsk_xdp = xsk_buff_alloc(xs->pool); + copy_to = xsk_xdp->data - meta_len; + + copied = xsk_copy_xdp_multi(copy_to, ©_from, to_len, &from_len, &frag, rem); + rem -= copied; + + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + __xsk_rcv_zc_multi(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); + meta_len = 0; + } while (rem); + + return 0; +} +#endif + static bool xsk_tx_writeable(struct xdp_sock *xs) { if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) @@ -229,6 +352,40 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; }
+#ifdef CONFIG_XSK_MULTI_BUF +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + if (!xsk_is_bound(xs)) + return -EINVAL; + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + xs->rx_dropped++; + return -ENOSPC; + } + + return 0; +} + +static void xsk_flush(struct xdp_sock *xs); +int xsk_generic_rcv_multi(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 len = xdp_get_buff_len(xdp); + int err; + + spin_lock_bh(&xs->rx_lock); + err = xsk_rcv_check(xs, xdp, len); + if (!err) { + err = __xsk_rcv_multi(xs, xdp, len); + xsk_flush(xs); + } + spin_unlock_bh(&xs->rx_lock); + return err; +} +#endif + static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, bool explicit_free) { @@ -678,7 +835,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | +#ifdef CONFIG_XSK_MULTI_BUF + XDP_USE_NEED_WAKEUP | XDP_USE_SG)) +#else XDP_USE_NEED_WAKEUP)) +#endif return -EINVAL;
bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); @@ -710,7 +871,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct socket *sock;
if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || +#ifdef CONFIG_XSK_MULTI_BUF + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { +#else (flags & XDP_USE_NEED_WAKEUP)) { +#endif /* Cannot specify flags for shared sockets. */ err = -EINVAL; goto out_unlock; @@ -796,6 +961,9 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xs->dev = dev; xs->zc = xs->umem->zc; +#ifdef CONFIG_XSK_MULTI_BUF + xs->sg = !!(flags & XDP_USE_SG); +#endif xs->queue_id = qid; xp_add_xsk(xs->pool, xs);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index a76d43787549..75d6fe2ae32d 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -123,6 +123,13 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) return false; }
+#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xp_unused_options_set(u32 options) +{ + return options & ~XDP_PKT_CONTD; +} +#endif + static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { @@ -138,7 +145,11 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, if (chunk >= pool->addrs_cnt) return false;
+#ifdef CONFIG_XSK_MULTI_BUF + if (xp_unused_options_set(desc->options)) +#else if (desc->options) +#endif return false; return true; } @@ -159,7 +170,11 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) return false;
+#ifdef CONFIG_XSK_MULTI_BUF + if (xp_unused_options_set(desc->options)) +#else if (desc->options) +#endif return false; return true; } @@ -275,6 +290,32 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
/* Functions for producers */
+#ifdef CONFIG_XSK_MULTI_BUF +static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) +{ + u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + if (free_entries >= max) + return max; + + /* Refresh the local tail pointer */ + q->cached_cons = READ_ONCE(q->ring->consumer); + free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + return free_entries >= max ? max : free_entries; +} + +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + return xskq_prod_nb_free(q, 1) ? false : true; +} + +static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_prod -= cnt; +} + +#else static inline bool xskq_prod_is_full(struct xsk_queue *q) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); @@ -288,6 +329,7 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q)
return !free_entries; } +#endif
static inline void xskq_prod_cancel(struct xsk_queue *q) { @@ -316,6 +358,26 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; }
+#ifdef CONFIG_XSK_MULTI_BUF +static inline int xskq_prod_reserve_desc_op(struct xsk_queue *q, + u64 addr, u32 len, u32 flags) +{ + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx; + + if (xskq_prod_is_full(q)) + return -ENOSPC; + + /* A, matches D */ + idx = q->cached_prod++ & q->ring_mask; + ring->desc[idx].addr = addr; + ring->desc[idx].len = len; + ring->desc[idx].options = flags; + + return 0; +} +#endif + static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) {
From: bitcoffee liuxin350@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAOZOH?from=project-issue
----------------------------------------------
Currently, ipvlan does not support XDP native mode. Add native XDP support to ipvlan in a simple way.

ipvlan works differently from an ordinary NIC in that it has no concept of hardware queues: all packets received by an ipvlan device are delivered to the same xsk, and the receive path runs in the softirq context of the underlying real device. Because xdp_do_redirect()/xdp_do_flush() are lock-free, using them on ipvlan would let receive queues on different CPUs write to the same xsk concurrently, and this race crashes the kernel.

Fixing that would require either changing ipvlan's packet-receive model or extending the XDP redirect/flush paths with locked variants. However, do_xdp_generic() already performs the necessary locking, so calling it directly meets our needs. Compared with plain generic XDP mode, invoking do_xdp_generic() from the ipvlan receive path (i.e. at the driver layer) saves one softirq pass and slightly improves performance. Each pass through the ipvlan receive path handles a single skb, so there is no opportunity in ipvlan to batch-flush multiple skbs via xdp_do_flush() anyway.
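As a usage sketch (assuming a libbpf toolchain where bpf_set_link_xdp_fd() is available; program loading is elided, and ifindex/prog_fd are placeholders), the XDP program is attached to the ipvlan device in the same way as to any other native-XDP-capable device, which reaches the new ipvlan_xdp() ndo_bpf hook:

	#include <stdio.h>
	#include <bpf/libbpf.h>
	#include <linux/if_link.h>

	/* prog_fd: fd of an already-loaded XDP program;
	 * ifindex: index of the ipvlan device, e.g. from if_nametoindex("ipvl0").
	 * XDP_FLAGS_DRV_MODE requests a native attach, served by ipvlan_xdp().
	 */
	if (bpf_set_link_xdp_fd(ifindex, prog_fd, XDP_FLAGS_DRV_MODE) < 0)
		fprintf(stderr, "attaching XDP program to ipvlan failed\n");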
Signed-off-by: bitcoffee liuxin350@huawei.com --- drivers/net/ipvlan/ipvlan.h | 1 + drivers/net/ipvlan/ipvlan_core.c | 16 ++++++++++++++++ drivers/net/ipvlan/ipvlan_main.c | 22 ++++++++++++++++++++++ 3 files changed, 39 insertions(+)
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 6796e742c470..11d81fb63cdb 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -77,6 +77,7 @@ struct ipvl_dev { unsigned long local_timeout; struct timer_list local_free_timer; struct sk_buff_head local_xmit_queue; + struct bpf_prog __rcu *xdp_prog; };
struct ipvl_addr { diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index d5ae4fd9f1c0..7c40580ad5a9 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -314,11 +314,27 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, { struct ipvl_dev *ipvlan = addr->master; struct net_device *dev = ipvlan->dev; + struct bpf_prog *xdp_prog = rtnl_dereference(ipvlan->xdp_prog); unsigned int len; rx_handler_result_t ret = RX_HANDLER_CONSUMED; bool success = false; struct sk_buff *skb = *pskb; + struct net_device *old_dev = skb->dev; + int xdp_ret; + + if (!xdp_prog) + goto go_network_stack; + skb->dev = dev; +#ifdef CONFIG_XSK_MULTI_BUF + xdp_ret = do_xdp_generic_multi(xdp_prog, &skb); +#else + xdp_ret = do_xdp_generic(xdp_prog, skb); +#endif + if (xdp_ret != XDP_PASS) + return ret; + skb->dev = old_dev;
+go_network_stack: len = skb->len + ETH_HLEN; /* Only packets exchanged between two local slaves need to have * device-up check as well as skb-share check. diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 503e9c0995fb..0b9d74ea3688 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -424,6 +424,27 @@ static int ipvlan_get_iflink(const struct net_device *dev) return ipvlan->phy_dev->ifindex; }
+static int ipvlan_xdp_set(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack) +{ + struct ipvl_dev *priv = netdev_priv(dev); + struct bpf_prog *old_prog; + + old_prog = rtnl_dereference(priv->xdp_prog); + rcu_assign_pointer(priv->xdp_prog, prog); + return 0; +} + +static int ipvlan_xdp(struct net_device *dev, struct netdev_bpf *xdp) +{ + switch (xdp->command) { + case XDP_SETUP_PROG: + return ipvlan_xdp_set(dev, xdp->prog, xdp->extack); + default: + return -EINVAL; + } +} + static const struct net_device_ops ipvlan_netdev_ops = { .ndo_init = ipvlan_init, .ndo_uninit = ipvlan_uninit, @@ -437,6 +458,7 @@ static const struct net_device_ops ipvlan_netdev_ops = { .ndo_vlan_rx_add_vid = ipvlan_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ipvlan_vlan_rx_kill_vid, .ndo_get_iflink = ipvlan_get_iflink, + .ndo_bpf = ipvlan_xdp, };
static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev,