 
            hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAOZOH?from=project-issue ---------------------------------------------- This is a partial backport from mainline e6d5dbdd20aa xdp: add multi-buff support for xdp running in generic mode c6a28acb1a27 net: fix pointer check in skb_pp_cow_data routine 4d2bb0bfe874 xdp: rely on skb pointer reference in do_xdp_generic and netif_receive_generic_xdp fe21cb91ae7b net: core: Split out code to run generic XDP prog cf24f5a5feea xsk: add support for AF_XDP multi-buffer on Tx path 2127c6043836 xsk: Add truesize to skb_add_rx_frag(). 9d0a67b9d42c xsk: Fix xsk_build_skb() error: 'skb' dereferencing possible ERR_PTR() 1b725b0c8163 xsk: allow core/drivers to test EOP bit b7f72a30e9ac xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path 804627751b42 xsk: add support for AF_XDP multi-buffer on Rx path faa91b839b09 xsk: move xdp_buff's data length check to xsk_rcv_check 556444c4e683 xsk: prepare both copy and zero-copy modes to co-exist 81470b5c3c66 xsk: introduce XSK_USE_SG bind flag for xsk socket 63a64a56bc3f xsk: prepare 'options' in xdp_desc for multi-buffer use 458f72723412 xsk: Remove explicit_free parameter from __xsk_rcv() a6e944f25cdb xsk: Fix generic transmit when completion queue reservation fails 9c8f21e6f885 xsk: Build skb by page (aka generic zerocopy xmit) c2ff53d8049f net: Add priv_flags for allow tx skb without linear 0165cc817075 bpf: introduce bpf_xdp_get_buff_len helper 2e88d4ff0301 xdp: introduce flags field in xdp_buff/xdp_frame d16697cb6261 net: skbuff: add size metadata to skb_shared_info for xdp 9349eb3a9d2a xsk: Introduce batched Tx descriptor interfaces Signed-off-by: Yue Haibing <yuehaibing@huawei.com> --- include/linux/netdevice.h | 7 + include/linux/skbuff.h | 6 + include/net/xdp.h | 42 ++++ include/net/xdp_sock.h | 20 ++ include/net/xdp_sock_drv.h | 14 ++ include/net/xsk_buff_pool.h | 7 + include/uapi/linux/if_xdp.h | 13 ++ 
net/core/dev.c | 210 +++++++++++++++++++ net/core/filter.c | 4 + net/core/skbuff.c | 90 ++++++++ net/xdp/Kconfig | 8 + net/xdp/xsk.c | 405 ++++++++++++++++++++++++++++++++++++ net/xdp/xsk_queue.h | 105 ++++++++++ 13 files changed, 931 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c765abd56c62..87fc83328987 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1620,6 +1620,8 @@ struct net_device_ops { * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running + * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with + * skb_headlen(skb) == 0 (data starts from frag0) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1653,6 +1655,7 @@ enum netdev_priv_flags { IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, + IFF_TX_SKB_NO_LINEAR = 1<<31, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1685,6 +1688,7 @@ enum netdev_priv_flags { #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK +#define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR /* Specifies the type of the struct net_device::ml_priv pointer */ enum netdev_ml_priv_type { @@ -3953,6 +3957,9 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); +#ifdef CONFIG_XSK_MULTI_BUF +int do_xdp_generic_multi(struct bpf_prog *xdp_prog, struct sk_buff **pskb); +#endif int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_rx_any_context(struct sk_buff *skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 95f0a69225fc..4a7a2ff7aec3 100644 --- a/include/linux/skbuff.h +++ 
b/include/linux/skbuff.h @@ -530,6 +530,9 @@ struct skb_shared_info { * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; +#ifdef CONFIG_XSK_MULTI_BUF + KABI_FILL_HOLE(unsigned int xdp_frags_size) +#endif /* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ @@ -3110,6 +3113,9 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } +#ifdef CONFIG_XSK_MULTI_BUF +int skb_cow_data_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog); +#endif /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment diff --git a/include/net/xdp.h b/include/net/xdp.h index a0d723fb032d..1461be3d8ef5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -71,6 +71,12 @@ struct xdp_txq_info { struct net_device *dev; }; +#ifdef CONFIG_XSK_MULTI_BUF +enum xdp_buff_flags { + XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ +}; +#endif + struct xdp_buff { void *data; void *data_end; @@ -79,8 +85,28 @@ struct xdp_buff { struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ +#ifdef CONFIG_XSK_MULTI_BUF + KABI_FILL_HOLE(u32 flags) /* supported values defined in xdp_buff_flags */ +#endif }; +#ifdef CONFIG_XSK_MULTI_BUF +static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); +} + +static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) +{ + xdp->flags |= XDP_FLAGS_HAS_FRAGS; +} + +static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; +} +#endif + /* Reserve memory area at end-of data area. 
* * This macro reserves tailroom in the XDP buffer by limiting the @@ -97,6 +123,22 @@ xdp_get_shared_info_from_buff(struct xdp_buff *xdp) return (struct skb_shared_info *)xdp_data_hard_end(xdp); } +#ifdef CONFIG_XSK_MULTI_BUF +static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) +{ + unsigned int len = xdp->data_end - xdp->data; + struct skb_shared_info *sinfo; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + len += sinfo->xdp_frags_size; +out: + return len; +} +#endif + struct xdp_frame { void *data; u16 len; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index cc17bc957548..aab493e68d71 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -43,6 +43,13 @@ struct xsk_map { struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; +#ifdef CONFIG_XSK_MULTI_BUF + /* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current + * packet, the partially built skb is saved here so that packet building can resume in next + * call of __xsk_generic_xmit(). 
+ */ + KABI_FILL_HOLE(struct sk_buff *skb) +#endif struct xsk_queue *rx ____cacheline_aligned_in_smp; struct net_device *dev; struct xdp_umem *umem; @@ -50,6 +57,9 @@ struct xdp_sock { struct xsk_buff_pool *pool; u16 queue_id; bool zc; +#ifdef CONFIG_XSK_MULTI_BUF + KABI_FILL_HOLE(bool sg) +#endif enum { XSK_READY = 0, XSK_BOUND, @@ -77,6 +87,9 @@ struct xdp_sock { #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +#ifdef CONFIG_XSK_MULTI_BUF +int xsk_generic_rcv_multi(struct xdp_sock *xs, struct xdp_buff *xdp); +#endif int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); @@ -100,6 +113,13 @@ static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return -ENOTSUPP; } +#ifdef CONFIG_XSK_MULTI_BUF +static inline int xsk_generic_rcv_multi(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} +#endif + static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 5b1ee8a9976d..4bad49c950c0 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -76,6 +76,13 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); } +#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +{ + return !xp_mb_desc(desc); +} +#endif + static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return xp_can_alloc(pool, count); @@ -205,6 +212,13 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; } +#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +{ + return false; +} +#endif + static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return false; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 40fa8e1d485a..d9df0e8e8e84 
100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -157,6 +157,13 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } +#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xp_mb_desc(struct xdp_desc *desc) +{ + return desc->options & XDP_PKT_CONTD; +} +#endif + static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) { return addr & pool->chunk_mask; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index a78a8096f4ce..8d48863472b9 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -25,6 +25,12 @@ * application. */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* By setting this option, userspace application indicates that it can + * handle multiple descriptors per packet thus enabling AF_XDP to split + * multi-buffer XDP frames into multiple Rx descriptors. Without this set + * such frames will be dropped. + */ +#define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) @@ -108,4 +114,11 @@ struct xdp_desc { /* UMEM descriptor is __u64 */ +/* Flag indicating that the packet continues with the buffer pointed out by the + * next frame in the ring. The end of the packet is signalled by setting this + * bit to zero. For single buffer packets, every descriptor has 'options' set + * to 0 and this maintains backward compatibility. 
+ */ +#define XDP_PKT_CONTD (1 << 0) + #endif /* _LINUX_IF_XDP_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 8e0f4690e157..3807c708d357 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4643,6 +4643,216 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) return rxqueue; } +#ifdef CONFIG_XSK_MULTI_BUF +static u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct netdev_rx_queue *rxqueue; + void *orig_data, *orig_data_end; + bool orig_bcast, orig_host; + __be16 orig_eth_type; + struct ethhdr *eth; + u32 metalen, act; + int hlen, off; + u32 mac_len; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp->data = skb->data - mac_len; + xdp->data_meta = xdp->data; + xdp->data_end = xdp->data + hlen; + xdp->data_hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } + + orig_data_end = xdp->data_end; + orig_data = xdp->data; + eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); + orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); + orig_eth_type = eth->h_proto; + + rxqueue = netif_get_rxqueue(skb); + xdp->rxq = &rxqueue->xdp_rxq; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + + /* check if bpf_xdp_adjust_head was used */ + off = xdp->data - orig_data; + if (off) { + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + skb->mac_header += off; + skb_reset_network_header(skb); + } + + /* check if bpf_xdp_adjust_tail was used 
*/ + off = xdp->data_end - orig_data_end; + if (off != 0) { + skb_set_tail_pointer(skb, xdp->data_end - xdp->data); + skb->len += off; /* positive on grow, negative on shrink */ + } + + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. + */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + + /* check if XDP changed eth hdr such SKB needs update */ + eth = (struct ethhdr *)xdp->data; + if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || + (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { + __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + } + + /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull + * before calling us again on redirect path. We do not call do_redirect + * as we leave that up to the caller. + * + * Caller is responsible for managing lifetime of skb (i.e. calling + * kfree_skb in response to actions it cannot handle/XDP_DROP). + */ + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + __skb_push(skb, mac_len); + break; + case XDP_PASS: + metalen = xdp->data - xdp->data_meta; + if (metalen) + skb_metadata_set(skb, metalen); + break; + } + + return act; +} + +static int +netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +{ + struct sk_buff *skb = *pskb; + int err, hroom, troom; + + if (!skb_cow_data_for_xdp(pskb, prog)) + return 0; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + troom = skb->tail + skb->data_len - skb->end; + err = pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? 
troom + 128 : 0, GFP_ATOMIC); + if (err) + return err; + + return skb_linearize(skb); +} + +static u32 netif_receive_generic_xdp_multi(struct sk_buff **pskb, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct sk_buff *skb = *pskb; + u32 mac_len, act = XDP_DROP; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_is_redirected(skb)) + return XDP_PASS; + + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM + * bytes. This is the guarantee that also native XDP provides, + * thus we need to do it here as well. + */ + mac_len = skb->data - skb_mac_header(skb); + __skb_push(skb, mac_len); + + if (skb_cloned(skb) || skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + if (netif_skb_check_for_xdp(pskb, xdp_prog)) + goto do_drop; + } + + __skb_pull(*pskb, mac_len); + + act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + case XDP_PASS: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception((*pskb)->dev, xdp_prog, act); + fallthrough; + case XDP_DROP: + do_drop: + kfree_skb(*pskb); + break; + } + + return act; +} + +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); +int do_xdp_generic_multi(struct bpf_prog *xdp_prog, struct sk_buff **pskb) +{ + if (xdp_prog) { + struct xdp_buff xdp; + u32 act; + int err; + + act = netif_receive_generic_xdp_multi(pskb, &xdp, xdp_prog); + if (act != XDP_PASS) { + switch (act) { + case XDP_REDIRECT: + err = xdp_do_generic_redirect((*pskb)->dev, *pskb, + &xdp, xdp_prog); + if (err) + goto out_redir; + break; + case XDP_TX: + generic_xdp_tx(*pskb, xdp_prog); + break; + } + return XDP_DROP; + } + } + return XDP_PASS; +out_redir: + kfree_skb(*pskb); + return XDP_DROP; +} +EXPORT_SYMBOL_GPL(do_xdp_generic_multi); +#endif + static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, 
struct bpf_prog *xdp_prog) diff --git a/net/core/filter.c b/net/core/filter.c index a4e94a3e8c76..a4d96f0231b2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4102,7 +4102,11 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { struct xdp_sock *xs = fwd; +#ifdef CONFIG_XSK_MULTI_BUF + err = xs->sg ? xsk_generic_rcv_multi(xs, xdp) : xsk_generic_rcv(xs, xdp); +#else err = xsk_generic_rcv(xs, xdp); +#endif if (err) goto err; consume_skb(skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index de0229b8a920..9b7dfa0314fa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4713,6 +4713,96 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) } EXPORT_SYMBOL_GPL(skb_cow_data); +#ifdef CONFIG_XSK_MULTI_BUF +#define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +static int skb_pp_cow_data(struct sk_buff **pskb, unsigned int headroom) +{ + u32 size, truesize, len, max_head_size, off; + struct sk_buff *skb = *pskb, *nskb; + int err, i, head_off; + struct page *page; + void *data; + + /* XDP does not support fraglist so we need to linearize + * the skb. + */ + if (skb_has_frag_list(skb)) + return -EOPNOTSUPP; + + max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); + if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) + return -ENOMEM; + + size = min_t(u32, skb->len, max_head_size); + truesize = SKB_HEAD_ALIGN(size) + headroom; + page = alloc_page(GFP_ATOMIC); + data = page ? 
page_address(page) : NULL; + if (!data) + return -ENOMEM; + + nskb = build_skb(data, truesize); + if (!nskb) { + __free_page(page); + return -ENOMEM; + } + + skb_reserve(nskb, headroom); + skb_copy_header(nskb, skb); + + err = skb_copy_bits(skb, 0, nskb->data, size); + if (err) { + consume_skb(nskb); + return err; + } + skb_put(nskb, size); + + head_off = skb_headroom(nskb) - skb_headroom(skb); + skb_headers_offset_update(nskb, head_off); + + off = size; + len = skb->len - off; + for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { + struct page *page; + u32 page_off = 0; + + size = min_t(u32, len, PAGE_SIZE); + truesize = size; + + page = alloc_page(GFP_ATOMIC); + if (!page) { + consume_skb(nskb); + return -ENOMEM; + } + + skb_add_rx_frag(nskb, i, page, page_off, size, truesize); + err = skb_copy_bits(skb, off, page_address(page) + page_off, + size); + if (err) { + consume_skb(nskb); + return err; + } + + len -= size; + off += size; + } + + consume_skb(skb); + *pskb = nskb; + + return 0; +} + +int skb_cow_data_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +{ + //if (!prog->aux->xdp_has_frags) + // return -EINVAL; + + return skb_pp_cow_data(pskb, XDP_PACKET_HEADROOM); +} +EXPORT_SYMBOL(skb_cow_data_for_xdp); +#endif + static void sock_rmem_free(struct sk_buff *skb) { struct sock *sk = skb->sk; diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig index 71af2febe72a..7c6306f67f68 100644 --- a/net/xdp/Kconfig +++ b/net/xdp/Kconfig @@ -14,3 +14,11 @@ config XDP_SOCKETS_DIAG help Support for PF_XDP sockets monitoring interface used by the ss tool. If unsure, say Y. + +config XSK_MULTI_BUF + bool "Support generic xdp xsk multi-buffer" + depends on XDP_SOCKETS + default n + help + Support for PF_XDP sockets multi-buffer. + If unsure, say N. 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 562d69f17b4c..be788e28f9d1 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -211,6 +211,129 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, return 0; } +#ifdef CONFIG_XSK_MULTI_BUF +static int __xsk_rcv_zc_multi(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, + u32 flags) +{ + u64 addr; + int err; + + addr = xp_get_handle(xskb); + err = xskq_prod_reserve_desc_op(xs->rx, addr, len, flags); + if (err) { + xs->rx_queue_full++; + return err; + } + + xp_release(xskb); + return 0; +} + +static void *xsk_copy_xdp_start(struct xdp_buff *from) +{ + if (unlikely(xdp_data_meta_unsupported(from))) + return from->data; + else + return from->data_meta; +} + +static u32 xsk_copy_xdp_multi(void *to, void **from, u32 to_len, + u32 *from_len, skb_frag_t **frag, u32 rem) +{ + u32 copied = 0; + + while (1) { + u32 copy_len = min_t(u32, *from_len, to_len); + + memcpy(to, *from, copy_len); + copied += copy_len; + if (rem == copied) + return copied; + + if (*from_len == copy_len) { + *from = skb_frag_address(*frag); + *from_len = skb_frag_size((*frag)++); + } else { + *from += copy_len; + *from_len -= copy_len; + } + if (to_len == copy_len) + return copied; + + to_len -= copy_len; + to += copy_len; + } +} + +static int __xsk_rcv_multi(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; + u32 from_len, meta_len, rem, num_desc; + struct xdp_buff_xsk *xskb; + struct xdp_buff *xsk_xdp; + skb_frag_t *frag; + + from_len = xdp->data_end - copy_from; + meta_len = xdp->data - copy_from; + rem = len + meta_len; + + if (len <= frame_size && !xdp_buff_has_frags(xdp)) { + int err; + + xsk_xdp = xsk_buff_alloc(xs->pool); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOMEM; + } + memcpy(xsk_xdp->data - meta_len, copy_from, rem); + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + err = 
__xsk_rcv_zc_multi(xs, xskb, len, 0); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + + return 0; + } + + num_desc = (len - 1) / frame_size + 1; + + if (!xsk_buff_can_alloc(xs->pool, num_desc)) { + xs->rx_dropped++; + return -ENOSPC; + } + + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + return -ENOBUFS; + } + if (xdp_buff_has_frags(xdp)) { + struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(xdp); + frag = &sinfo->frags[0]; + } + + do { + u32 to_len = frame_size + meta_len; + u32 copied; + + xsk_xdp = xsk_buff_alloc(xs->pool); + copy_to = xsk_xdp->data - meta_len; + + copied = xsk_copy_xdp_multi(copy_to, &copy_from, to_len, &from_len, &frag, rem); + rem -= copied; + + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + __xsk_rcv_zc_multi(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); + meta_len = 0; + } while (rem); + + return 0; +} +#endif + static bool xsk_tx_writeable(struct xdp_sock *xs) { if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) @@ -229,6 +352,40 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } +#ifdef CONFIG_XSK_MULTI_BUF +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + if (!xsk_is_bound(xs)) + return -EINVAL; + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + xs->rx_dropped++; + return -ENOSPC; + } + + return 0; +} + +static void xsk_flush(struct xdp_sock *xs); +int xsk_generic_rcv_multi(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 len = xdp_get_buff_len(xdp); + int err; + + spin_lock_bh(&xs->rx_lock); + err = xsk_rcv_check(xs, xdp, len); + if (!err) { + err = __xsk_rcv_multi(xs, xdp, len); + xsk_flush(xs); + } + spin_unlock_bh(&xs->rx_lock); + return err; +} +#endif + static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, bool explicit_free) { @@ -371,6 +528,231 @@ static void 
xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } +#ifdef CONFIG_XSK_MULTI_BUF +static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + ret = xskq_prod_reserve_addr(xs->pool->cq, addr); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + + return ret; +} + +static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) +{ + unsigned long flags; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_submit_n(xs->pool->cq, n); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) +{ + unsigned long flags; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel_n(xs->pool->cq, n); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static u32 xsk_get_num_desc(struct sk_buff *skb) +{ + return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; +} + +static void xsk_destruct_skb_multi(struct sk_buff *skb) +{ + xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); + sock_wfree(skb); +} + +static void xsk_set_destructor_arg(struct sk_buff *skb) +{ + long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; + + skb_shinfo(skb)->destructor_arg = (void *)num; +} + +static void xsk_consume_skb(struct sk_buff *skb) +{ + struct xdp_sock *xs = xdp_sk(skb->sk); + + skb->destructor = sock_wfree; + xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); + /* Free skb without triggering the perf drop trace */ + consume_skb(skb); + xs->skb = NULL; +} + +static void xsk_drop_skb(struct sk_buff *skb) +{ + xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); + xsk_consume_skb(skb); +} + +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + struct net_device *dev = xs->dev; + struct sk_buff *skb = xs->skb; + int err; + + if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { + err = -ENOTSUPP; + goto free_err; + } else { + u32 hr, tr, len; + void *buffer; + + 
buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); + len = desc->len; + + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); + tr = dev->needed_tailroom; + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); + if (unlikely(!skb)) + goto free_err; + + skb_reserve(skb, hr); + skb_put(skb, len); + + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) { + kfree_skb(skb); + goto free_err; + } + } else { + int nr_frags = skb_shinfo(skb)->nr_frags; + struct page *page; + u8 *vaddr; + + if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { + err = -EOVERFLOW; + goto free_err; + } + + page = alloc_page(xs->sk.sk_allocation); + if (unlikely(!page)) { + err = -EAGAIN; + goto free_err; + } + + vaddr = kmap(page); + memcpy(vaddr, buffer, len); + kunmap(page); + + skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); + refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); + } + } + + skb->dev = dev; + skb->priority = xs->sk.sk_priority; + skb->mark = xs->sk.sk_mark; + skb->destructor = xsk_destruct_skb_multi; + xsk_set_destructor_arg(skb); + + return skb; +free_err: + if (err == -EOVERFLOW) { + /* Drop the packet */ + xsk_set_destructor_arg(xs->skb); + xsk_drop_skb(xs->skb); + xskq_cons_release(xs->tx); + } else { + /* Let application retry */ + xsk_cq_cancel_locked(xs, 1); + } + + return ERR_PTR(err); +} + +static int xsk_generic_xmit_multi(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + u32 max_batch = TX_BATCH_SIZE; + bool sent_frame = false; + struct xdp_desc desc; + struct sk_buff *skb; + int err = 0; + + mutex_lock(&xs->mutex); + + if (xs->queue_id >= xs->dev->real_num_tx_queues) + goto out; + + while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { + if (max_batch-- == 0) { + err = -EAGAIN; + goto out; + } + + /* This is the backpressure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. 
This avoids having to implement + * any buffering in the Tx path. + */ + if (xsk_cq_reserve_addr_locked(xs, desc.addr)) + goto out; + + skb = xsk_build_skb(xs, &desc); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + if (err != -EOVERFLOW) + goto out; + err = 0; + continue; + } + + xskq_cons_release(xs->tx); + + if (xp_mb_desc(&desc)) { + xs->skb = skb; + continue; + } + + err = __dev_direct_xmit(skb, xs->queue_id); + if (err == NETDEV_TX_BUSY) { + /* Tell user-space to retry the send */ + xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); + xsk_consume_skb(skb); + err = -EAGAIN; + goto out; + } + + /* Ignore NET_XMIT_CN as packet might have been sent */ + if (err == NET_XMIT_DROP) { + /* SKB completed but not sent */ + err = -EBUSY; + xs->skb = NULL; + goto out; + } + + sent_frame = true; + xs->skb = NULL; + } + + if (xskq_has_descs(xs->tx)) { + if (xs->skb) + xsk_drop_skb(xs->skb); + xskq_cons_release(xs->tx); + } + +out: + if (sent_frame) + if (xsk_tx_writeable(xs)) + sk->sk_write_space(sk); + + mutex_unlock(&xs->mutex); + return err; +} +#endif + static int xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); @@ -474,7 +856,14 @@ static int __xsk_sendmsg(struct sock *sk) if (unlikely(!xs->tx)) return -ENOBUFS; +#ifdef CONFIG_XSK_MULTI_BUF + if (xs->zc) + return xsk_zc_xmit(xs); + else + return xs->sg ? xsk_generic_xmit_multi(sk) : xsk_generic_xmit(sk); +#else return xs->zc ? 
xsk_zc_xmit(xs) : xsk_generic_xmit(sk); +#endif } static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) @@ -612,6 +1001,11 @@ static int xsk_release(struct socket *sock) net = sock_net(sk); +#ifdef CONFIG_XSK_MULTI_BUF + if (xs->skb) + xsk_drop_skb(xs->skb); +#endif + mutex_lock(&net->xdp.lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->xdp.lock); @@ -678,7 +1072,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | +#ifdef CONFIG_XSK_MULTI_BUF + XDP_USE_NEED_WAKEUP | XDP_USE_SG)) +#else XDP_USE_NEED_WAKEUP)) +#endif return -EINVAL; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); @@ -710,7 +1108,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || +#ifdef CONFIG_XSK_MULTI_BUF + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { +#else (flags & XDP_USE_NEED_WAKEUP)) { +#endif /* Cannot specify flags for shared sockets. 
*/ err = -EINVAL; goto out_unlock; @@ -796,6 +1198,9 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->dev = dev; xs->zc = xs->umem->zc; +#ifdef CONFIG_XSK_MULTI_BUF + xs->sg = !!(flags & XDP_USE_SG); +#endif xs->queue_id = qid; xp_add_xsk(xs->pool, xs); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index a76d43787549..84ee9bf94f71 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -123,6 +123,13 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) return false; } +#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xp_unused_options_set(u32 options) +{ + return options & ~XDP_PKT_CONTD; +} +#endif + static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { @@ -138,7 +145,11 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, if (chunk >= pool->addrs_cnt) return false; +#ifdef CONFIG_XSK_MULTI_BUF + if (xp_unused_options_set(desc->options)) +#else if (desc->options) +#endif return false; return true; } @@ -159,7 +170,11 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) return false; +#ifdef CONFIG_XSK_MULTI_BUF + if (xp_unused_options_set(desc->options)) +#else if (desc->options) +#endif return false; return true; } @@ -171,6 +186,13 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool, xp_aligned_validate_desc(pool, desc); } +#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xskq_has_descs(struct xsk_queue *q) +{ + return q->cached_cons != q->cached_prod; +} +#endif + static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xsk_buff_pool *pool) @@ -182,6 +204,24 @@ static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, return true; } +#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xskq_cons_read_desc_multi(struct xsk_queue *q, + struct xdp_desc *desc, + struct xsk_buff_pool *pool) +{ + if 
(q->cached_cons != q->cached_prod) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx = q->cached_cons & q->ring_mask; + + *desc = ring->desc[idx]; + return xskq_cons_is_valid_desc(q, desc, pool); + } + + q->queue_empty_descs++; + return false; +} +#endif + static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) @@ -241,6 +281,17 @@ static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) return xskq_cons_read_addr_unchecked(q, addr); } +#ifdef CONFIG_XSK_MULTI_BUF +static inline bool xskq_cons_peek_desc_multi(struct xsk_queue *q, + struct xdp_desc *desc, + struct xsk_buff_pool *pool) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_desc_multi(q, desc, pool); +} +#endif + static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) @@ -267,6 +318,13 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q) q->nentries; } +#ifdef CONFIG_XSK_MULTI_BUF +static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons -= cnt; +} +#endif + static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -275,6 +333,32 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q) /* Functions for producers */ +#ifdef CONFIG_XSK_MULTI_BUF +static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) +{ + u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + if (free_entries >= max) + return max; + + /* Refresh the local tail pointer */ + q->cached_cons = READ_ONCE(q->ring->consumer); + free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + return free_entries >= max ? max : free_entries; +} + +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + return xskq_prod_nb_free(q, 1) ? 
false : true; +} + +static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_prod -= cnt; +} + +#else static inline bool xskq_prod_is_full(struct xsk_queue *q) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); @@ -288,6 +372,7 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q) return !free_entries; } +#endif static inline void xskq_prod_cancel(struct xsk_queue *q) { @@ -316,6 +401,26 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +#ifdef CONFIG_XSK_MULTI_BUF +static inline int xskq_prod_reserve_desc_op(struct xsk_queue *q, + u64 addr, u32 len, u32 flags) +{ + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx; + + if (xskq_prod_is_full(q)) + return -ENOSPC; + + /* A, matches D */ + idx = q->cached_prod++ & q->ring_mask; + ring->desc[idx].addr = addr; + ring->desc[idx].len = len; + ring->desc[idx].options = flags; + + return 0; +} +#endif + static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) { -- 2.34.1