This series includes some optimizations in the IO path for the HNS3 ethernet driver.
--- Change log: V1 -> V2: 1. use sgt->orig_nents instead of sgt->nents in patch #5 2. fix a type mismatch issue in patch #5
Huazhong Tan (1): net: hns3: add support to query tx spare buffer size for pf
Yunsheng Lin (6): net: hns3: minor refactor related to desc_cb handling net: hns3: refactor for hns3_fill_desc() function net: hns3: use tx bounce buffer for small packets net: hns3: support dma_map_sg() for multi frags skb net: hns3: optimize the rx page reuse handling process net: hns3: use bounce buffer when rx page can not be reused
drivers/net/ethernet/hisilicon/hns3/hnae3.h | 8 +- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 54 ++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 575 ++++++++++++++++++--- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 58 ++- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 66 +++ .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 2 + .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 14 + .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 + 8 files changed, 680 insertions(+), 99 deletions(-)
From: Yunsheng Lin linyunsheng@huawei.com
desc_cb is used to store mapping and freeing info for the corresponding desc, which is used in the cleaning process. More desc_cb types will be added when supporting the tx bounce buffer, so change the desc_cb type to a bit-wise value in order to reduce the desc_cb type checking operations in the data path.
Also move the desc_cb type definition to hns3_enet.h because it is only used in hns3_enet.c, and declare a local variable desc_cb in hns3_clear_desc() to reduce lines of code.
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 7 ----- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 40 +++++++++++-------------- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 7 +++++ 3 files changed, 25 insertions(+), 29 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index ba883b0..5822fc0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -159,13 +159,6 @@ enum HNAE3_PF_CAP_BITS { #define ring_ptr_move_bw(ring, p) \ ((ring)->p = ((ring)->p - 1 + (ring)->desc_num) % (ring)->desc_num)
-enum hns_desc_type { - DESC_TYPE_UNKNOWN, - DESC_TYPE_SKB, - DESC_TYPE_FRAGLIST_SKB, - DESC_TYPE_PAGE, -}; - struct hnae3_handle;
struct hnae3_queue { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 9a45f3c..f03a7a9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1413,7 +1413,7 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, }
static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv, - unsigned int size, enum hns_desc_type type) + unsigned int size, unsigned int type) { #define HNS3_LIKELY_BD_NUM 1
@@ -1425,8 +1425,7 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv, int k, sizeoflast; dma_addr_t dma;
- if (type == DESC_TYPE_FRAGLIST_SKB || - type == DESC_TYPE_SKB) { + if (type & (DESC_TYPE_FRAGLIST_SKB | DESC_TYPE_SKB)) { struct sk_buff *skb = (struct sk_buff *)priv;
dma = dma_map_single(dev, skb->data, size, DMA_TO_DEVICE); @@ -1704,6 +1703,7 @@ static void hns3_clear_desc(struct hns3_enet_ring *ring, int next_to_use_orig)
for (i = 0; i < ring->desc_num; i++) { struct hns3_desc *desc = &ring->desc[ring->next_to_use]; + struct hns3_desc_cb *desc_cb;
memset(desc, 0, sizeof(*desc));
@@ -1714,31 +1714,27 @@ static void hns3_clear_desc(struct hns3_enet_ring *ring, int next_to_use_orig) /* rollback one */ ring_ptr_move_bw(ring, next_to_use);
- if (!ring->desc_cb[ring->next_to_use].dma) + desc_cb = &ring->desc_cb[ring->next_to_use]; + + if (!desc_cb->dma) continue;
/* unmap the descriptor dma address */ - if (ring->desc_cb[ring->next_to_use].type == DESC_TYPE_SKB || - ring->desc_cb[ring->next_to_use].type == - DESC_TYPE_FRAGLIST_SKB) - dma_unmap_single(dev, - ring->desc_cb[ring->next_to_use].dma, - ring->desc_cb[ring->next_to_use].length, - DMA_TO_DEVICE); - else if (ring->desc_cb[ring->next_to_use].length) - dma_unmap_page(dev, - ring->desc_cb[ring->next_to_use].dma, - ring->desc_cb[ring->next_to_use].length, + if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) + dma_unmap_single(dev, desc_cb->dma, desc_cb->length, + DMA_TO_DEVICE); + else if (desc_cb->length) + dma_unmap_page(dev, desc_cb->dma, desc_cb->length, DMA_TO_DEVICE);
- ring->desc_cb[ring->next_to_use].length = 0; - ring->desc_cb[ring->next_to_use].dma = 0; - ring->desc_cb[ring->next_to_use].type = DESC_TYPE_UNKNOWN; + desc_cb->length = 0; + desc_cb->dma = 0; + desc_cb->type = DESC_TYPE_UNKNOWN; } }
static int hns3_fill_skb_to_desc(struct hns3_enet_ring *ring, - struct sk_buff *skb, enum hns_desc_type type) + struct sk_buff *skb, unsigned int type) { unsigned int size = skb_headlen(skb); struct sk_buff *frag_skb; @@ -2859,7 +2855,7 @@ static int hns3_alloc_buffer(struct hns3_enet_ring *ring, static void hns3_free_buffer(struct hns3_enet_ring *ring, struct hns3_desc_cb *cb, int budget) { - if (cb->type == DESC_TYPE_SKB) + if (cb->type & DESC_TYPE_SKB) napi_consume_skb(cb->priv, budget); else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias) __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); @@ -2880,7 +2876,7 @@ static int hns3_map_buffer(struct hns3_enet_ring *ring, struct hns3_desc_cb *cb) static void hns3_unmap_buffer(struct hns3_enet_ring *ring, struct hns3_desc_cb *cb) { - if (cb->type == DESC_TYPE_SKB || cb->type == DESC_TYPE_FRAGLIST_SKB) + if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) dma_unmap_single(ring_to_dev(ring), cb->dma, cb->length, ring_to_dma_dir(ring)); else if (cb->length) @@ -3037,7 +3033,7 @@ static bool hns3_nic_reclaim_desc(struct hns3_enet_ring *ring,
desc_cb = &ring->desc_cb[ntc];
- if (desc_cb->type == DESC_TYPE_SKB) { + if (desc_cb->type & DESC_TYPE_SKB) { (*pkts)++; (*bytes) += desc_cb->send_bytes; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 79821c7..9d18b94 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -299,6 +299,13 @@ struct __packed hns3_desc { }; };
+enum hns3_desc_type { + DESC_TYPE_UNKNOWN = 0, + DESC_TYPE_SKB = 1 << 0, + DESC_TYPE_FRAGLIST_SKB = 1 << 1, + DESC_TYPE_PAGE = 1 << 2, +}; + struct hns3_desc_cb { dma_addr_t dma; /* dma address of this desc */ void *buf; /* cpu addr for a desc */
From: Yunsheng Lin linyunsheng@huawei.com
Factor out hns3_fill_desc() so that it can be reused when adding support for the tx bounce buffer.
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 87 ++++++++++++++----------- 1 file changed, 48 insertions(+), 39 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index f03a7a9..6fa1ed5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1412,39 +1412,14 @@ static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, return 0; }
-static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv, - unsigned int size, unsigned int type) +static int hns3_fill_desc(struct hns3_enet_ring *ring, dma_addr_t dma, + unsigned int size) { #define HNS3_LIKELY_BD_NUM 1
- struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; struct hns3_desc *desc = &ring->desc[ring->next_to_use]; - struct device *dev = ring_to_dev(ring); - skb_frag_t *frag; unsigned int frag_buf_num; int k, sizeoflast; - dma_addr_t dma; - - if (type & (DESC_TYPE_FRAGLIST_SKB | DESC_TYPE_SKB)) { - struct sk_buff *skb = (struct sk_buff *)priv; - - dma = dma_map_single(dev, skb->data, size, DMA_TO_DEVICE); - } else { - frag = (skb_frag_t *)priv; - dma = skb_frag_dma_map(dev, frag, 0, size, DMA_TO_DEVICE); - } - - if (unlikely(dma_mapping_error(dev, dma))) { - u64_stats_update_begin(&ring->syncp); - ring->stats.sw_err_cnt++; - u64_stats_update_end(&ring->syncp); - return -ENOMEM; - } - - desc_cb->priv = priv; - desc_cb->length = size; - desc_cb->dma = dma; - desc_cb->type = type;
if (likely(size <= HNS3_MAX_BD_SIZE)) { desc->addr = cpu_to_le64(dma); @@ -1480,6 +1455,47 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv, return frag_buf_num; }
+static int hns3_map_and_fill_desc(struct hns3_enet_ring *ring, void *priv, + unsigned int type) +{ + struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; + struct device *dev = ring_to_dev(ring); + unsigned int size; + dma_addr_t dma; + + if (type & (DESC_TYPE_FRAGLIST_SKB | DESC_TYPE_SKB)) { + struct sk_buff *skb = (struct sk_buff *)priv; + + size = skb_headlen(skb); + if (!size) + return 0; + + dma = dma_map_single(dev, skb->data, size, DMA_TO_DEVICE); + } else { + skb_frag_t *frag = (skb_frag_t *)priv; + + size = skb_frag_size(frag); + if (!size) + return 0; + + dma = skb_frag_dma_map(dev, frag, 0, size, DMA_TO_DEVICE); + } + + if (unlikely(dma_mapping_error(dev, dma))) { + u64_stats_update_begin(&ring->syncp); + ring->stats.sw_err_cnt++; + u64_stats_update_end(&ring->syncp); + return -ENOMEM; + } + + desc_cb->priv = priv; + desc_cb->length = size; + desc_cb->dma = dma; + desc_cb->type = type; + + return hns3_fill_desc(ring, dma, size); +} + static unsigned int hns3_skb_bd_num(struct sk_buff *skb, unsigned int *bd_size, unsigned int bd_num) { @@ -1736,26 +1752,19 @@ static void hns3_clear_desc(struct hns3_enet_ring *ring, int next_to_use_orig) static int hns3_fill_skb_to_desc(struct hns3_enet_ring *ring, struct sk_buff *skb, unsigned int type) { - unsigned int size = skb_headlen(skb); struct sk_buff *frag_skb; int i, ret, bd_num = 0;
- if (size) { - ret = hns3_fill_desc(ring, skb, size, type); - if (unlikely(ret < 0)) - return ret; + ret = hns3_map_and_fill_desc(ring, skb, type); + if (unlikely(ret < 0)) + return ret;
- bd_num += ret; - } + bd_num += ret;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- size = skb_frag_size(frag); - if (!size) - continue; - - ret = hns3_fill_desc(ring, frag, size, DESC_TYPE_PAGE); + ret = hns3_map_and_fill_desc(ring, frag, DESC_TYPE_PAGE); if (unlikely(ret < 0)) return ret;
From: Yunsheng Lin linyunsheng@huawei.com
When the packet or frag size is small, it causes both security and performance issues. As dma can't map a sub-page, some extra kernel data is visible to devices. On the other hand, the overhead of dma map and unmap is huge when the IOMMU is on.
So add a queue based tx shared bounce buffer to memcpy the small packet when the len of the xmitted skb is below tx_copybreak. Add tx_spare_buf_size module param to set the size of tx spare buffer, and add set/get_tunable to set or query the tx_copybreak.
The throughput improves from 30 Gbps to 90+ Gbps when running 16 netperf threads with 32KB UDP message size when the IOMMU is in strict mode (tx_copybreak = 2000 and mtu = 1500).
Suggested-by: Barry Song song.bao.hua@hisilicon.com Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 52 ++++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 289 ++++++++++++++++++++- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 43 ++- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 51 ++++ 4 files changed, 420 insertions(+), 15 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index c512a63..a24a75c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -392,6 +392,56 @@ static void hns3_dbg_fill_content(char *content, u16 len, *pos++ = '\0'; }
+static const struct hns3_dbg_item tx_spare_info_items[] = { + { "QUEUE_ID", 2 }, + { "COPYBREAK", 2 }, + { "LEN", 7 }, + { "NTU", 4 }, + { "NTC", 4 }, + { "LTC", 4 }, + { "DMA", 17 }, +}; + +static void hns3_dbg_tx_spare_info(struct hns3_enet_ring *ring, char *buf, + int len, u32 ring_num, int *pos) +{ + char data_str[ARRAY_SIZE(tx_spare_info_items)][HNS3_DBG_DATA_STR_LEN]; + struct hns3_tx_spare *tx_spare = ring->tx_spare; + char *result[ARRAY_SIZE(tx_spare_info_items)]; + char content[HNS3_DBG_INFO_LEN]; + u32 i, j; + + if (!tx_spare) { + *pos += scnprintf(buf + *pos, len - *pos, + "tx spare buffer is not enabled\n"); + return; + } + + for (i = 0; i < ARRAY_SIZE(tx_spare_info_items); i++) + result[i] = &data_str[i][0]; + + *pos += scnprintf(buf + *pos, len - *pos, "tx spare buffer info\n"); + hns3_dbg_fill_content(content, sizeof(content), tx_spare_info_items, + NULL, ARRAY_SIZE(tx_spare_info_items)); + *pos += scnprintf(buf + *pos, len - *pos, "%s", content); + + for (i = 0; i < ring_num; i++) { + j = 0; + sprintf(result[j++], "%8u", i); + sprintf(result[j++], "%9u", ring->tx_copybreak); + sprintf(result[j++], "%3u", tx_spare->len); + sprintf(result[j++], "%3u", tx_spare->next_to_use); + sprintf(result[j++], "%3u", tx_spare->next_to_clean); + sprintf(result[j++], "%3u", tx_spare->last_to_clean); + sprintf(result[j++], "%pad", &tx_spare->dma); + hns3_dbg_fill_content(content, sizeof(content), + tx_spare_info_items, + (const char **)result, + ARRAY_SIZE(tx_spare_info_items)); + *pos += scnprintf(buf + *pos, len - *pos, "%s", content); + } +} + static const struct hns3_dbg_item rx_queue_info_items[] = { { "QUEUE_ID", 2 }, { "BD_NUM", 2 }, @@ -593,6 +643,8 @@ static int hns3_dbg_tx_queue_info(struct hnae3_handle *h, pos += scnprintf(buf + pos, len - pos, "%s", content); }
+ hns3_dbg_tx_spare_info(ring, buf, len, h->kinfo.num_tqps, &pos); + return 0; }
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 6fa1ed5..e5466da 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -53,6 +53,10 @@ static int debug = -1; module_param(debug, int, 0); MODULE_PARM_DESC(debug, " Network interface message level setting");
+static unsigned int tx_spare_buf_size; +module_param(tx_spare_buf_size, uint, 0400); +MODULE_PARM_DESC(tx_spare_buf_size, "Size used to allocate tx spare buffer"); + #define DEFAULT_MSG_LEVEL (NETIF_MSG_PROBE | NETIF_MSG_LINK | \ NETIF_MSG_IFDOWN | NETIF_MSG_IFUP)
@@ -941,6 +945,177 @@ void hns3_request_update_promisc_mode(struct hnae3_handle *handle) ops->request_update_promisc_mode(handle); }
+static u32 hns3_tx_spare_space(struct hns3_enet_ring *ring) +{ + struct hns3_tx_spare *tx_spare = ring->tx_spare; + u32 ntc, ntu; + + /* This smp_load_acquire() pairs with smp_store_release() in + * hns3_tx_spare_update() called in tx desc cleaning process. + */ + ntc = smp_load_acquire(&tx_spare->last_to_clean); + ntu = tx_spare->next_to_use; + + if (ntc > ntu) + return ntc - ntu - 1; + + /* The free tx buffer is divided into two part, so pick the + * larger one. + */ + return (ntc > (tx_spare->len - ntu) ? ntc : + (tx_spare->len - ntu)) - 1; +} + +static void hns3_tx_spare_update(struct hns3_enet_ring *ring) +{ + struct hns3_tx_spare *tx_spare = ring->tx_spare; + + if (!tx_spare || + tx_spare->last_to_clean == tx_spare->next_to_clean) + return; + + /* This smp_store_release() pairs with smp_load_acquire() in + * hns3_tx_spare_space() called in xmit process. + */ + smp_store_release(&tx_spare->last_to_clean, + tx_spare->next_to_clean); +} + +static bool hns3_can_use_tx_bounce(struct hns3_enet_ring *ring, + struct sk_buff *skb, + u32 space) +{ + u32 len = skb->len <= ring->tx_copybreak ? 
skb->len : + skb_headlen(skb); + + if (len > ring->tx_copybreak) + return false; + + if (ALIGN(len, dma_get_cache_alignment()) > space) { + u64_stats_update_begin(&ring->syncp); + ring->stats.tx_spare_full++; + u64_stats_update_end(&ring->syncp); + return false; + } + + return true; +} + +static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) +{ + struct hns3_tx_spare *tx_spare; + struct page *page; + dma_addr_t dma; + int order; + + if (!tx_spare_buf_size) + return; + + order = get_order(tx_spare_buf_size); + tx_spare = devm_kzalloc(ring_to_dev(ring), sizeof(*tx_spare), + GFP_KERNEL); + if (!tx_spare) { + /* The driver still work without the tx spare buffer */ + dev_warn(ring_to_dev(ring), "failed to allocate hns3_tx_spare\n"); + return; + } + + page = alloc_pages_node(dev_to_node(ring_to_dev(ring)), + GFP_KERNEL, order); + if (!page) { + dev_warn(ring_to_dev(ring), "failed to allocate tx spare pages\n"); + devm_kfree(ring_to_dev(ring), tx_spare); + return; + } + + dma = dma_map_page(ring_to_dev(ring), page, 0, + PAGE_SIZE << order, DMA_TO_DEVICE); + if (dma_mapping_error(ring_to_dev(ring), dma)) { + dev_warn(ring_to_dev(ring), "failed to map pages for tx spare\n"); + put_page(page); + devm_kfree(ring_to_dev(ring), tx_spare); + return; + } + + tx_spare->dma = dma; + tx_spare->buf = page_address(page); + tx_spare->len = PAGE_SIZE << order; + ring->tx_spare = tx_spare; +} + +/* Use hns3_tx_spare_space() to make sure there is enough buffer + * before calling below function to allocate tx buffer. + */ +static void *hns3_tx_spare_alloc(struct hns3_enet_ring *ring, + unsigned int size, dma_addr_t *dma, + u32 *cb_len) +{ + struct hns3_tx_spare *tx_spare = ring->tx_spare; + u32 ntu = tx_spare->next_to_use; + + size = ALIGN(size, dma_get_cache_alignment()); + *cb_len = size; + + /* Tx spare buffer wraps back here because the end of + * freed tx buffer is not enough. 
+ */ + if (ntu + size > tx_spare->len) { + *cb_len += (tx_spare->len - ntu); + ntu = 0; + } + + tx_spare->next_to_use = ntu + size; + if (tx_spare->next_to_use == tx_spare->len) + tx_spare->next_to_use = 0; + + *dma = tx_spare->dma + ntu; + + return tx_spare->buf + ntu; +} + +static void hns3_tx_spare_rollback(struct hns3_enet_ring *ring, u32 len) +{ + struct hns3_tx_spare *tx_spare = ring->tx_spare; + + if (len > tx_spare->next_to_use) { + len -= tx_spare->next_to_use; + tx_spare->next_to_use = tx_spare->len - len; + } else { + tx_spare->next_to_use -= len; + } +} + +static void hns3_tx_spare_reclaim_cb(struct hns3_enet_ring *ring, + struct hns3_desc_cb *cb) +{ + struct hns3_tx_spare *tx_spare = ring->tx_spare; + u32 ntc = tx_spare->next_to_clean; + u32 len = cb->length; + + tx_spare->next_to_clean += len; + + if (tx_spare->next_to_clean >= tx_spare->len) { + tx_spare->next_to_clean -= tx_spare->len; + + if (tx_spare->next_to_clean) { + ntc = 0; + len = tx_spare->next_to_clean; + } + } + + /* This tx spare buffer is only really reclaimed after calling + * hns3_tx_spare_update(), so it is still safe to use the info in + * the tx buffer to do the dma sync after tx_spare->next_to_clean + * is moved forword. + */ + if (cb->type & (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) { + dma_addr_t dma = tx_spare->dma + ntc; + + dma_sync_single_for_cpu(ring_to_dev(ring), dma, len, + DMA_TO_DEVICE); + } +} + static int hns3_set_tso(struct sk_buff *skb, u32 *paylen_fdop_ol4cs, u16 *mss, u32 *type_cs_vlan_tso, u32 *send_bytes) { @@ -1471,6 +1646,11 @@ static int hns3_map_and_fill_desc(struct hns3_enet_ring *ring, void *priv, return 0;
dma = dma_map_single(dev, skb->data, size, DMA_TO_DEVICE); + } else if (type & DESC_TYPE_BOUNCE_HEAD) { + /* Head data has been filled in hns3_handle_tx_bounce(), + * just return 0 here. + */ + return 0; } else { skb_frag_t *frag = (skb_frag_t *)priv;
@@ -1739,6 +1919,9 @@ static void hns3_clear_desc(struct hns3_enet_ring *ring, int next_to_use_orig) if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) dma_unmap_single(dev, desc_cb->dma, desc_cb->length, DMA_TO_DEVICE); + else if (desc_cb->type & + (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) + hns3_tx_spare_rollback(ring, desc_cb->length); else if (desc_cb->length) dma_unmap_page(dev, desc_cb->dma, desc_cb->length, DMA_TO_DEVICE); @@ -1816,6 +1999,79 @@ static void hns3_tsyn(struct net_device *netdev, struct sk_buff *skb, desc->tx.bdtp_fe_sc_vld_ra_ri |= cpu_to_le16(BIT(HNS3_TXD_TSYN_B)); }
+static int hns3_handle_tx_bounce(struct hns3_enet_ring *ring, + struct sk_buff *skb) +{ + struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; + unsigned int type = DESC_TYPE_BOUNCE_HEAD; + unsigned int size = skb_headlen(skb); + dma_addr_t dma; + int bd_num = 0; + u32 cb_len; + void *buf; + int ret; + + if (skb->len <= ring->tx_copybreak) { + size = skb->len; + type = DESC_TYPE_BOUNCE_ALL; + } + + /* hns3_can_use_tx_bounce() is called to ensure the below + * function can always return the tx buffer. + */ + buf = hns3_tx_spare_alloc(ring, size, &dma, &cb_len); + + ret = skb_copy_bits(skb, 0, buf, size); + if (unlikely(ret < 0)) { + hns3_tx_spare_rollback(ring, cb_len); + u64_stats_update_begin(&ring->syncp); + ring->stats.copy_bits_err++; + u64_stats_update_end(&ring->syncp); + return ret; + } + + desc_cb->priv = skb; + desc_cb->length = cb_len; + desc_cb->dma = dma; + desc_cb->type = type; + + bd_num += hns3_fill_desc(ring, dma, size); + + if (type == DESC_TYPE_BOUNCE_HEAD) { + ret = hns3_fill_skb_to_desc(ring, skb, + DESC_TYPE_BOUNCE_HEAD); + if (unlikely(ret < 0)) + return ret; + + bd_num += ret; + } + + dma_sync_single_for_device(ring_to_dev(ring), dma, size, + DMA_TO_DEVICE); + + u64_stats_update_begin(&ring->syncp); + ring->stats.tx_bounce++; + u64_stats_update_end(&ring->syncp); + return bd_num; +} + +static int hns3_handle_desc_filling(struct hns3_enet_ring *ring, + struct sk_buff *skb) +{ + u32 space; + + if (!ring->tx_spare) + goto out; + + space = hns3_tx_spare_space(ring); + + if (hns3_can_use_tx_bounce(ring, skb, space)) + return hns3_handle_tx_bounce(ring, skb); + +out: + return hns3_fill_skb_to_desc(ring, skb, DESC_TYPE_SKB); +} + netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev) { struct hns3_nic_priv *priv = netdev_priv(netdev); @@ -1862,7 +2118,7 @@ netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev) * zero, which is unlikely, and 'ret > 0' means how many tx desc * need to be 
notified to the hw. */ - ret = hns3_fill_skb_to_desc(ring, skb, DESC_TYPE_SKB); + ret = hns3_handle_desc_filling(ring, skb); if (unlikely(ret <= 0)) goto fill_err;
@@ -2064,6 +2320,7 @@ static void hns3_nic_get_stats64(struct net_device *netdev, tx_drop += ring->stats.tx_tso_err; tx_drop += ring->stats.over_max_recursion; tx_drop += ring->stats.hw_limitation; + tx_drop += ring->stats.copy_bits_err; tx_errors += ring->stats.sw_err_cnt; tx_errors += ring->stats.tx_vlan_err; tx_errors += ring->stats.tx_l4_proto_err; @@ -2071,6 +2328,7 @@ static void hns3_nic_get_stats64(struct net_device *netdev, tx_errors += ring->stats.tx_tso_err; tx_errors += ring->stats.over_max_recursion; tx_errors += ring->stats.hw_limitation; + tx_errors += ring->stats.copy_bits_err; } while (u64_stats_fetch_retry_irq(&ring->syncp, start));
/* fetch the rx stats */ @@ -2864,7 +3122,8 @@ static int hns3_alloc_buffer(struct hns3_enet_ring *ring, static void hns3_free_buffer(struct hns3_enet_ring *ring, struct hns3_desc_cb *cb, int budget) { - if (cb->type & DESC_TYPE_SKB) + if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD | + DESC_TYPE_BOUNCE_ALL)) napi_consume_skb(cb->priv, budget); else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias) __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); @@ -2888,9 +3147,11 @@ static void hns3_unmap_buffer(struct hns3_enet_ring *ring, if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) dma_unmap_single(ring_to_dev(ring), cb->dma, cb->length, ring_to_dma_dir(ring)); - else if (cb->length) + else if ((cb->type & DESC_TYPE_PAGE) && cb->length) dma_unmap_page(ring_to_dev(ring), cb->dma, cb->length, ring_to_dma_dir(ring)); + else if (cb->type & (DESC_TYPE_BOUNCE_ALL | DESC_TYPE_BOUNCE_HEAD)) + hns3_tx_spare_reclaim_cb(ring, cb); }
static void hns3_buffer_detach(struct hns3_enet_ring *ring, int i) @@ -3042,7 +3303,8 @@ static bool hns3_nic_reclaim_desc(struct hns3_enet_ring *ring,
desc_cb = &ring->desc_cb[ntc];
- if (desc_cb->type & DESC_TYPE_SKB) { + if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_ALL | + DESC_TYPE_BOUNCE_HEAD)) { (*pkts)++; (*bytes) += desc_cb->send_bytes; } @@ -3065,6 +3327,9 @@ static bool hns3_nic_reclaim_desc(struct hns3_enet_ring *ring, * ring_space called by hns3_nic_net_xmit. */ smp_store_release(&ring->next_to_clean, ntc); + + hns3_tx_spare_update(ring); + return true; }
@@ -4245,6 +4510,8 @@ static void hns3_ring_get_cfg(struct hnae3_queue *q, struct hns3_nic_priv *priv, ring = &priv->ring[q->tqp_index]; desc_num = priv->ae_handle->kinfo.num_tx_desc; ring->queue_index = q->tqp_index; + ring->tx_copybreak = priv->tx_copybreak; + ring->last_to_use = 0; } else { ring = &priv->ring[q->tqp_index + queue_num]; desc_num = priv->ae_handle->kinfo.num_rx_desc; @@ -4262,7 +4529,6 @@ static void hns3_ring_get_cfg(struct hnae3_queue *q, struct hns3_nic_priv *priv, ring->desc_num = desc_num; ring->next_to_use = 0; ring->next_to_clean = 0; - ring->last_to_use = 0; }
static void hns3_queue_to_ring(struct hnae3_queue *tqp, @@ -4322,6 +4588,8 @@ static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) ret = hns3_alloc_ring_buffers(ring); if (ret) goto out_with_desc; + } else { + hns3_init_tx_spare_buffer(ring); }
return 0; @@ -4344,9 +4612,18 @@ void hns3_fini_ring(struct hns3_enet_ring *ring) ring->next_to_use = 0; ring->last_to_use = 0; ring->pending_buf = 0; - if (ring->skb) { + if (!HNAE3_IS_TX_RING(ring) && ring->skb) { dev_kfree_skb_any(ring->skb); ring->skb = NULL; + } else if (HNAE3_IS_TX_RING(ring) && ring->tx_spare) { + struct hns3_tx_spare *tx_spare = ring->tx_spare; + + dma_unmap_page(ring_to_dev(ring), tx_spare->dma, tx_spare->len, + DMA_TO_DEVICE); + free_pages((unsigned long)tx_spare->buf, + get_order(tx_spare->len)); + devm_kfree(ring_to_dev(ring), tx_spare); + ring->tx_spare = NULL; } }
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 9d18b94..8d147c1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -304,6 +304,8 @@ enum hns3_desc_type { DESC_TYPE_SKB = 1 << 0, DESC_TYPE_FRAGLIST_SKB = 1 << 1, DESC_TYPE_PAGE = 1 << 2, + DESC_TYPE_BOUNCE_ALL = 1 << 3, + DESC_TYPE_BOUNCE_HEAD = 1 << 4, };
struct hns3_desc_cb { @@ -405,6 +407,9 @@ struct ring_stats { u64 tx_tso_err; u64 over_max_recursion; u64 hw_limitation; + u64 tx_bounce; + u64 tx_spare_full; + u64 copy_bits_err; }; struct { u64 rx_pkts; @@ -423,6 +428,15 @@ struct ring_stats { }; };
+struct hns3_tx_spare { + dma_addr_t dma; + void *buf; + u32 next_to_use; + u32 next_to_clean; + u32 last_to_clean; + u32 len; +}; + struct hns3_enet_ring { struct hns3_desc *desc; /* dma map address space */ struct hns3_desc_cb *desc_cb; @@ -445,18 +459,28 @@ struct hns3_enet_ring { * next_to_use */ int next_to_clean; - union { - int last_to_use; /* last idx used by xmit */ - u32 pull_len; /* memcpy len for current rx packet */ - }; - u32 frag_num; - void *va; /* first buffer address for current packet */ - u32 flag; /* ring attribute */
int pending_buf; - struct sk_buff *skb; - struct sk_buff *tail_skb; + union { + /* for Tx ring */ + struct { + u32 fd_qb_tx_sample; + int last_to_use; /* last idx used by xmit */ + u32 tx_copybreak; + struct hns3_tx_spare *tx_spare; + }; + + /* for Rx ring */ + struct { + u32 pull_len; /* memcpy len for current rx packet */ + u32 frag_num; + /* first buffer address for current packet */ + unsigned char *va; + struct sk_buff *skb; + struct sk_buff *tail_skb; + }; + }; } ____cacheline_internodealigned_in_smp;
enum hns3_flow_level_range { @@ -540,6 +564,7 @@ struct hns3_nic_priv {
struct hns3_enet_coalesce tx_coal; struct hns3_enet_coalesce rx_coal; + u32 tx_copybreak; };
union l3_hdr_info { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index acef543..f306de1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -46,6 +46,9 @@ static const struct hns3_stats hns3_txq_stats[] = { HNS3_TQP_STAT("tso_err", tx_tso_err), HNS3_TQP_STAT("over_max_recursion", over_max_recursion), HNS3_TQP_STAT("hw_limitation", hw_limitation), + HNS3_TQP_STAT("bounce", tx_bounce), + HNS3_TQP_STAT("spare_full", tx_spare_full), + HNS3_TQP_STAT("copy_bits_err", copy_bits_err), };
#define HNS3_TXQ_STATS_COUNT ARRAY_SIZE(hns3_txq_stats) @@ -1592,6 +1595,50 @@ static int hns3_set_priv_flags(struct net_device *netdev, u32 pflags) return 0; }
+static int hns3_get_tunable(struct net_device *netdev, + const struct ethtool_tunable *tuna, + void *data) +{ + struct hns3_nic_priv *priv = netdev_priv(netdev); + int ret = 0; + + switch (tuna->id) { + case ETHTOOL_TX_COPYBREAK: + /* all the tx rings have the same tx_copybreak */ + *(u32 *)data = priv->tx_copybreak; + break; + default: + ret = -EOPNOTSUPP; + break; + } + + return ret; +} + +static int hns3_set_tunable(struct net_device *netdev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct hns3_nic_priv *priv = netdev_priv(netdev); + struct hnae3_handle *h = priv->ae_handle; + int i, ret = 0; + + switch (tuna->id) { + case ETHTOOL_TX_COPYBREAK: + priv->tx_copybreak = *(u32 *)data; + + for (i = 0; i < h->kinfo.num_tqps; i++) + priv->ring[i].tx_copybreak = priv->tx_copybreak; + + break; + default: + ret = -EOPNOTSUPP; + break; + } + + return ret; +} + #define HNS3_ETHTOOL_COALESCE (ETHTOOL_COALESCE_USECS | \ ETHTOOL_COALESCE_USE_ADAPTIVE | \ ETHTOOL_COALESCE_RX_USECS_HIGH | \ @@ -1635,6 +1682,8 @@ static const struct ethtool_ops hns3vf_ethtool_ops = { .set_msglevel = hns3_set_msglevel, .get_priv_flags = hns3_get_priv_flags, .set_priv_flags = hns3_set_priv_flags, + .get_tunable = hns3_get_tunable, + .set_tunable = hns3_set_tunable, };
static const struct ethtool_ops hns3_ethtool_ops = { @@ -1674,6 +1723,8 @@ static const struct ethtool_ops hns3_ethtool_ops = { .get_priv_flags = hns3_get_priv_flags, .set_priv_flags = hns3_set_priv_flags, .get_ts_info = hns3_get_ts_info, + .get_tunable = hns3_get_tunable, + .set_tunable = hns3_set_tunable, };
void hns3_ethtool_set_ops(struct net_device *netdev)
From: Huazhong Tan tanhuazhong@huawei.com
Add support to query tx spare buffer size from configuration file, and use this info to do spare buffer initialization when the module parameter 'tx_spare_buf_size' is not specified.
Signed-off-by: Huazhong Tan tanhuazhong@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 + drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 7 +++++-- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 2 ++ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 14 ++++++++++++++ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 ++ 5 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 5822fc0..0b202f4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -760,6 +760,7 @@ struct hnae3_knic_private_info { u16 rx_buf_len; u16 num_tx_desc; u16 num_rx_desc; + u32 tx_spare_buf_size;
struct hnae3_tc_info tc_info;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index e5466da..d86b373 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1005,13 +1005,16 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) { struct hns3_tx_spare *tx_spare; struct page *page; + u32 alloc_size; dma_addr_t dma; int order;
- if (!tx_spare_buf_size) + alloc_size = tx_spare_buf_size ? tx_spare_buf_size : + ring->tqp->handle->kinfo.tx_spare_buf_size; + if (!alloc_size) return;
- order = get_order(tx_spare_buf_size); + order = get_order(alloc_size); tx_spare = devm_kzalloc(ring_to_dev(ring), sizeof(*tx_spare), GFP_KERNEL); if (!tx_spare) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 51be76f..a322dfe 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -542,6 +542,8 @@ struct hclge_pf_res_cmd { #define HCLGE_CFG_UMV_TBL_SPACE_M GENMASK(31, 16) #define HCLGE_CFG_PF_RSS_SIZE_S 0 #define HCLGE_CFG_PF_RSS_SIZE_M GENMASK(3, 0) +#define HCLGE_CFG_TX_SPARE_BUF_SIZE_S 4 +#define HCLGE_CFG_TX_SPARE_BUF_SIZE_M GENMASK(15, 4)
#define HCLGE_CFG_CMD_CNT 4
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index f6fdf93..f3e482a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1279,6 +1279,7 @@ static u32 hclge_get_max_speed(u16 speed_ability)
static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc) { +#define HCLGE_TX_SPARE_SIZE_UNIT 4096 #define SPEED_ABILITY_EXT_SHIFT 8
struct hclge_cfg_param_cmd *req; @@ -1358,6 +1359,15 @@ static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc) cfg->pf_rss_size_max = cfg->pf_rss_size_max ? 1U << cfg->pf_rss_size_max : cfg->vf_rss_size_max; + + /* The unit of the tx spare buffer size queried from configuration + * file is HCLGE_TX_SPARE_SIZE_UNIT(4096) bytes, so a conversion is + * needed here. + */ + cfg->tx_spare_buf_size = hnae3_get_field(__le32_to_cpu(req->param[2]), + HCLGE_CFG_TX_SPARE_BUF_SIZE_M, + HCLGE_CFG_TX_SPARE_BUF_SIZE_S); + cfg->tx_spare_buf_size *= HCLGE_TX_SPARE_SIZE_UNIT; }
/* hclge_get_cfg: query the static parameter from flash @@ -1539,6 +1549,7 @@ static int hclge_configure(struct hclge_dev *hdev) hdev->tc_max = cfg.tc_num; hdev->tm_info.hw_pfc_map = 0; hdev->wanted_umv_size = cfg.umv_space; + hdev->tx_spare_buf_size = cfg.tx_spare_buf_size; if (cfg.vlan_fliter_cap == HCLGE_VLAN_FLTR_CAN_MDF) set_bit(HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B, ae_dev->caps);
@@ -1736,6 +1747,7 @@ static int hclge_knic_setup(struct hclge_vport *vport, u16 num_tqps, kinfo->num_rx_desc = num_rx_desc;
kinfo->rx_buf_len = hdev->rx_buf_len; + kinfo->tx_spare_buf_size = hdev->tx_spare_buf_size;
kinfo->tqp = devm_kcalloc(&hdev->pdev->dev, num_tqps, sizeof(struct hnae3_queue *), GFP_KERNEL); @@ -11059,6 +11071,8 @@ static void hclge_info_show(struct hclge_dev *hdev) hdev->flag & HCLGE_FLAG_DCB_ENABLE ? "enable" : "disable"); dev_info(dev, "MQPRIO %s\n", hdev->flag & HCLGE_FLAG_MQPRIO_ENABLE ? "enable" : "disable"); + dev_info(dev, "Default tx spare buffer size: %u\n", + hdev->tx_spare_buf_size);
dev_info(dev, "PF info end.\n"); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 0285273..3d33524 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -384,6 +384,7 @@ struct hclge_cfg { u8 mac_addr[ETH_ALEN]; u8 default_speed; u32 numa_node_map; + u32 tx_spare_buf_size; u16 speed_ability; u16 umv_space; }; @@ -848,6 +849,7 @@ struct hclge_dev { u16 alloc_rss_size; /* Allocated RSS task queue */ u16 vf_rss_size_max; /* HW defined VF max RSS task queue */ u16 pf_rss_size_max; /* HW defined PF max RSS task queue */ + u32 tx_spare_buf_size; /* HW defined TX spare buffer size */
u16 fdir_pf_filter_count; /* Num of guaranteed filters for this PF */ u16 num_alloc_vport; /* Num vports this driver supports */
From: Yunsheng Lin linyunsheng@huawei.com
Using the queue-based tx buffer, it is also possible to allocate an sgl buffer and use skb_to_sgvec() to convert the skb to an sgvec, so that dma_map_sg() can be used to decrease the overhead of IOMMU mapping and unmapping.
Firstly, it reduces the number of buffers. For example, a tcp skb may have a 66-byte header and 3 fragments of 4328, 32768, and 28064 bytes. With this patch, dma_map_sg() will combine them into two buffers, a 66-byte header and one 65160-byte fragment, by using the IOMMU.
Secondly, it reduces the number of dma mapping and unmapping operations. All the original 4 buffers are mapped only once rather than 4 times.
The throughput improves by more than 10% when running a single thread of iperf using TCP with the IOMMU in strict mode.
Suggested-by: Barry Song song.bao.hua@hisilicon.com Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 111 ++++++++++++++++++++- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 4 + drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 3 + 3 files changed, 113 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index d86b373..f60a344 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -57,6 +57,15 @@ static unsigned int tx_spare_buf_size; module_param(tx_spare_buf_size, uint, 0400); MODULE_PARM_DESC(tx_spare_buf_size, "Size used to allocate tx spare buffer");
+static unsigned int tx_sgl = 1; +module_param(tx_sgl, uint, 0600); +MODULE_PARM_DESC(tx_sgl, "Minimum number of frags when using dma_map_sg() to optimize the IOMMU mapping"); + +#define HNS3_SGL_SIZE(nfrag) (sizeof(struct scatterlist) * (nfrag) + \ + sizeof(struct sg_table)) +#define HNS3_MAX_SGL_SIZE ALIGN(HNS3_SGL_SIZE(HNS3_MAX_TSO_BD_NUM),\ + dma_get_cache_alignment()) + #define DEFAULT_MSG_LEVEL (NETIF_MSG_PROBE | NETIF_MSG_LINK | \ NETIF_MSG_IFDOWN | NETIF_MSG_IFUP)
@@ -1001,6 +1010,25 @@ static bool hns3_can_use_tx_bounce(struct hns3_enet_ring *ring, return true; }
+static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring, + struct sk_buff *skb, + u32 space) +{ + if (skb->len <= ring->tx_copybreak || !tx_sgl || + (!skb_has_frag_list(skb) && + skb_shinfo(skb)->nr_frags < tx_sgl)) + return false; + + if (space < HNS3_MAX_SGL_SIZE) { + u64_stats_update_begin(&ring->syncp); + ring->stats.tx_spare_full++; + u64_stats_update_end(&ring->syncp); + return false; + } + + return true; +} + static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) { struct hns3_tx_spare *tx_spare; @@ -1108,14 +1136,19 @@ static void hns3_tx_spare_reclaim_cb(struct hns3_enet_ring *ring,
/* This tx spare buffer is only really reclaimed after calling * hns3_tx_spare_update(), so it is still safe to use the info in - * the tx buffer to do the dma sync after tx_spare->next_to_clean - * is moved forword. + * the tx buffer to do the dma sync or sg unmapping after + * tx_spare->next_to_clean is moved forword. */ if (cb->type & (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) { dma_addr_t dma = tx_spare->dma + ntc;
dma_sync_single_for_cpu(ring_to_dev(ring), dma, len, DMA_TO_DEVICE); + } else { + struct sg_table *sgt = tx_spare->buf + ntc; + + dma_unmap_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents, + DMA_TO_DEVICE); } }
@@ -2058,6 +2091,65 @@ static int hns3_handle_tx_bounce(struct hns3_enet_ring *ring, return bd_num; }
+static int hns3_handle_tx_sgl(struct hns3_enet_ring *ring, + struct sk_buff *skb) +{ + struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; + u32 nfrag = skb_shinfo(skb)->nr_frags + 1; + struct sg_table *sgt; + int i, bd_num = 0; + dma_addr_t dma; + u32 cb_len; + int nents; + + if (skb_has_frag_list(skb)) + nfrag = HNS3_MAX_TSO_BD_NUM; + + /* hns3_can_use_tx_sgl() is called to ensure the below + * function can always return the tx buffer. + */ + sgt = hns3_tx_spare_alloc(ring, HNS3_SGL_SIZE(nfrag), + &dma, &cb_len); + + /* scatterlist follows by the sg table */ + sgt->sgl = (struct scatterlist *)(sgt + 1); + sg_init_table(sgt->sgl, nfrag); + nents = skb_to_sgvec(skb, sgt->sgl, 0, skb->len); + if (unlikely(nents < 0)) { + hns3_tx_spare_rollback(ring, cb_len); + u64_stats_update_begin(&ring->syncp); + ring->stats.skb2sgl_err++; + u64_stats_update_end(&ring->syncp); + return -ENOMEM; + } + + sgt->orig_nents = nents; + sgt->nents = dma_map_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents, + DMA_TO_DEVICE); + if (unlikely(!sgt->nents)) { + hns3_tx_spare_rollback(ring, cb_len); + u64_stats_update_begin(&ring->syncp); + ring->stats.map_sg_err++; + u64_stats_update_end(&ring->syncp); + return -ENOMEM; + } + + desc_cb->priv = skb; + desc_cb->length = cb_len; + desc_cb->dma = dma; + desc_cb->type = DESC_TYPE_SGL_SKB; + + for (i = 0; i < sgt->nents; i++) + bd_num += hns3_fill_desc(ring, sg_dma_address(sgt->sgl + i), + sg_dma_len(sgt->sgl + i)); + + u64_stats_update_begin(&ring->syncp); + ring->stats.tx_sgl++; + u64_stats_update_end(&ring->syncp); + + return bd_num; +} + static int hns3_handle_desc_filling(struct hns3_enet_ring *ring, struct sk_buff *skb) { @@ -2068,6 +2160,9 @@ static int hns3_handle_desc_filling(struct hns3_enet_ring *ring,
space = hns3_tx_spare_space(ring);
+ if (hns3_can_use_tx_sgl(ring, skb, space)) + return hns3_handle_tx_sgl(ring, skb); + if (hns3_can_use_tx_bounce(ring, skb, space)) return hns3_handle_tx_bounce(ring, skb);
@@ -2324,6 +2419,8 @@ static void hns3_nic_get_stats64(struct net_device *netdev, tx_drop += ring->stats.over_max_recursion; tx_drop += ring->stats.hw_limitation; tx_drop += ring->stats.copy_bits_err; + tx_drop += ring->stats.skb2sgl_err; + tx_drop += ring->stats.map_sg_err; tx_errors += ring->stats.sw_err_cnt; tx_errors += ring->stats.tx_vlan_err; tx_errors += ring->stats.tx_l4_proto_err; @@ -2332,6 +2429,8 @@ static void hns3_nic_get_stats64(struct net_device *netdev, tx_errors += ring->stats.over_max_recursion; tx_errors += ring->stats.hw_limitation; tx_errors += ring->stats.copy_bits_err; + tx_errors += ring->stats.skb2sgl_err; + tx_errors += ring->stats.map_sg_err; } while (u64_stats_fetch_retry_irq(&ring->syncp, start));
/* fetch the rx stats */ @@ -3126,7 +3225,7 @@ static void hns3_free_buffer(struct hns3_enet_ring *ring, struct hns3_desc_cb *cb, int budget) { if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD | - DESC_TYPE_BOUNCE_ALL)) + DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB)) napi_consume_skb(cb->priv, budget); else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias) __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); @@ -3153,7 +3252,8 @@ static void hns3_unmap_buffer(struct hns3_enet_ring *ring, else if ((cb->type & DESC_TYPE_PAGE) && cb->length) dma_unmap_page(ring_to_dev(ring), cb->dma, cb->length, ring_to_dma_dir(ring)); - else if (cb->type & (DESC_TYPE_BOUNCE_ALL | DESC_TYPE_BOUNCE_HEAD)) + else if (cb->type & (DESC_TYPE_BOUNCE_ALL | DESC_TYPE_BOUNCE_HEAD | + DESC_TYPE_SGL_SKB)) hns3_tx_spare_reclaim_cb(ring, cb); }
@@ -3307,7 +3407,8 @@ static bool hns3_nic_reclaim_desc(struct hns3_enet_ring *ring, desc_cb = &ring->desc_cb[ntc];
if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_ALL | - DESC_TYPE_BOUNCE_HEAD)) { + DESC_TYPE_BOUNCE_HEAD | + DESC_TYPE_SGL_SKB)) { (*pkts)++; (*bytes) += desc_cb->send_bytes; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 8d147c1..22ae291 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -306,6 +306,7 @@ enum hns3_desc_type { DESC_TYPE_PAGE = 1 << 2, DESC_TYPE_BOUNCE_ALL = 1 << 3, DESC_TYPE_BOUNCE_HEAD = 1 << 4, + DESC_TYPE_SGL_SKB = 1 << 5, };
struct hns3_desc_cb { @@ -410,6 +411,9 @@ struct ring_stats { u64 tx_bounce; u64 tx_spare_full; u64 copy_bits_err; + u64 tx_sgl; + u64 skb2sgl_err; + u64 map_sg_err; }; struct { u64 rx_pkts; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index f306de1..d785271 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -49,6 +49,9 @@ static const struct hns3_stats hns3_txq_stats[] = { HNS3_TQP_STAT("bounce", tx_bounce), HNS3_TQP_STAT("spare_full", tx_spare_full), HNS3_TQP_STAT("copy_bits_err", copy_bits_err), + HNS3_TQP_STAT("sgl", tx_sgl), + HNS3_TQP_STAT("skb2sgl_err", skb2sgl_err), + HNS3_TQP_STAT("map_sg_err", map_sg_err), };
#define HNS3_TXQ_STATS_COUNT ARRAY_SIZE(hns3_txq_stats)
From: Yunsheng Lin linyunsheng@huawei.com
Currently the rx page offset is only reset to zero when all of the conditions below are satisfied: 1. the rx page is only owned by the driver. 2. the rx page is reusable. 3. the page offset that is about to be given to the stack has reached the end of the page.
If the page offset is over the hns3_buf_size(), it means the buffer below the offset of the page is usable when conditions 1 & 2 above are satisfied, so the page offset can be reset to zero instead of being increased. We may be able to always reuse the first 4K buffer of a 64K page, which means we can limit the hot buffer size as much as possible.
The above optimization is a side effect of refactoring the rx page reuse handling in order to support the rx copybreak.
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 44 ++++++++++++------------- 1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index f60a344..98e8a54 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3525,7 +3525,7 @@ static void hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring,
static bool hns3_can_reuse_page(struct hns3_desc_cb *cb) { - return (page_count(cb->priv) - cb->pagecnt_bias) == 1; + return page_count(cb->priv) == cb->pagecnt_bias; }
static void hns3_nic_reuse_page(struct sk_buff *skb, int i, @@ -3533,40 +3533,40 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i, struct hns3_desc_cb *desc_cb) { struct hns3_desc *desc = &ring->desc[ring->next_to_clean]; + u32 frag_offset = desc_cb->page_offset + pull_len; int size = le16_to_cpu(desc->rx.size); u32 truesize = hns3_buf_size(ring); + u32 frag_size = size - pull_len;
- desc_cb->pagecnt_bias--; - skb_add_rx_frag(skb, i, desc_cb->priv, desc_cb->page_offset + pull_len, - size - pull_len, truesize); + /* Avoid re-using remote or pfmem page */ + if (unlikely(!dev_page_is_reusable(desc_cb->priv))) + goto out;
- /* Avoid re-using remote and pfmemalloc pages, or the stack is still - * using the page when page_offset rollback to zero, flag default - * unreuse + /* Stack is not using and current page_offset is non-zero, we can + * reuse from the zero offset. */ - if (!dev_page_is_reusable(desc_cb->priv) || - (!desc_cb->page_offset && !hns3_can_reuse_page(desc_cb))) { - __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias); - return; - } - - /* Move offset up to the next cache line */ - desc_cb->page_offset += truesize; - - if (desc_cb->page_offset + truesize <= hns3_page_size(ring)) { + if (desc_cb->page_offset && hns3_can_reuse_page(desc_cb)) { + desc_cb->page_offset = 0; desc_cb->reuse_flag = 1; - } else if (hns3_can_reuse_page(desc_cb)) { + } else if (desc_cb->page_offset + truesize * 2 <= + hns3_page_size(ring)) { + desc_cb->page_offset += truesize; desc_cb->reuse_flag = 1; - desc_cb->page_offset = 0; - } else if (desc_cb->pagecnt_bias) { - __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias); - return; }
+out: + desc_cb->pagecnt_bias--; + if (unlikely(!desc_cb->pagecnt_bias)) { page_ref_add(desc_cb->priv, USHRT_MAX); desc_cb->pagecnt_bias = USHRT_MAX; } + + skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, + frag_size, truesize); + + if (unlikely(!desc_cb->reuse_flag)) + __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias); }
static int hns3_gro_complete(struct sk_buff *skb, u32 l234info)
From: Yunsheng Lin linyunsheng@huawei.com
Currently the rx page will be reused to receive future packets when the stack releases the previous skb quickly. If the old page can not be reused, a new page will be allocated and mapped, which consumes a lot of cpu when the IOMMU is in strict mode, especially when the application and irq/NAPI happen to run on the same cpu.
So allocate a new frag to memcpy the data to avoid the costly IOMMU unmapping/mapping operation, and add "frag_alloc_err" and "frag_alloc" stats in "ethtool -S ethX" cmd.
The throughput improves by more than 50% when running a single thread of iperf using TCP when the IOMMU is in strict mode and iperf shares the same cpu with irq/NAPI (rx_copybreak = 2048 and mtu = 1500).
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 2 ++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 23 ++++++++++++++++++++++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 4 ++++ drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 12 +++++++++++ 4 files changed, 41 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index a24a75c..34b6cd9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -450,6 +450,7 @@ static const struct hns3_dbg_item rx_queue_info_items[] = { { "HEAD", 2 }, { "FBDNUM", 2 }, { "PKTNUM", 2 }, + { "COPYBREAK", 2 }, { "RING_EN", 2 }, { "RX_RING_EN", 2 }, { "BASE_ADDR", 10 }, @@ -481,6 +482,7 @@ static void hns3_dump_rx_queue_info(struct hns3_enet_ring *ring,
sprintf(result[j++], "%6u", readl_relaxed(ring->tqp->io_base + HNS3_RING_RX_RING_PKTNUM_RECORD_REG)); + sprintf(result[j++], "%9u", ring->rx_copybreak);
sprintf(result[j++], "%7s", readl_relaxed(ring->tqp->io_base + HNS3_RING_EN_REG) ? "on" : "off"); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 98e8a54..51bbf5f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3552,6 +3552,28 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i, hns3_page_size(ring)) { desc_cb->page_offset += truesize; desc_cb->reuse_flag = 1; + } else if (frag_size <= ring->rx_copybreak) { + void *frag = napi_alloc_frag(frag_size); + + if (unlikely(!frag)) { + u64_stats_update_begin(&ring->syncp); + ring->stats.frag_alloc_err++; + u64_stats_update_end(&ring->syncp); + + hns3_rl_err(ring_to_netdev(ring), + "failed to allocate rx frag\n"); + goto out; + } + + desc_cb->reuse_flag = 1; + memcpy(frag, desc_cb->buf + frag_offset, frag_size); + skb_add_rx_frag(skb, i, virt_to_page(frag), + offset_in_page(frag), frag_size, frag_size); + + u64_stats_update_begin(&ring->syncp); + ring->stats.frag_alloc++; + u64_stats_update_end(&ring->syncp); + return; }
out: @@ -4620,6 +4642,7 @@ static void hns3_ring_get_cfg(struct hnae3_queue *q, struct hns3_nic_priv *priv, ring = &priv->ring[q->tqp_index + queue_num]; desc_num = priv->ae_handle->kinfo.num_rx_desc; ring->queue_index = q->tqp_index; + ring->rx_copybreak = priv->rx_copybreak; }
hnae3_set_bit(ring->flag, HNAE3_RING_TYPE_B, ring_type); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 22ae291..15af3d9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -427,6 +427,8 @@ struct ring_stats { u64 csum_complete; u64 rx_multicast; u64 non_reuse_pg; + u64 frag_alloc_err; + u64 frag_alloc; }; __le16 csum; }; @@ -478,6 +480,7 @@ struct hns3_enet_ring { /* for Rx ring */ struct { u32 pull_len; /* memcpy len for current rx packet */ + u32 rx_copybreak; u32 frag_num; /* first buffer address for current packet */ unsigned char *va; @@ -569,6 +572,7 @@ struct hns3_nic_priv { struct hns3_enet_coalesce tx_coal; struct hns3_enet_coalesce rx_coal; u32 tx_copybreak; + u32 rx_copybreak; };
union l3_hdr_info { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index d785271..82061ab 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -71,6 +71,8 @@ static const struct hns3_stats hns3_rxq_stats[] = { HNS3_TQP_STAT("csum_complete", csum_complete), HNS3_TQP_STAT("multicast", rx_multicast), HNS3_TQP_STAT("non_reuse_pg", non_reuse_pg), + HNS3_TQP_STAT("frag_alloc_err", frag_alloc_err), + HNS3_TQP_STAT("frag_alloc", frag_alloc), };
#define HNS3_PRIV_FLAGS_LEN ARRAY_SIZE(hns3_priv_flags) @@ -1610,6 +1612,9 @@ static int hns3_get_tunable(struct net_device *netdev, /* all the tx rings have the same tx_copybreak */ *(u32 *)data = priv->tx_copybreak; break; + case ETHTOOL_RX_COPYBREAK: + *(u32 *)data = priv->rx_copybreak; + break; default: ret = -EOPNOTSUPP; break; @@ -1634,6 +1639,13 @@ static int hns3_set_tunable(struct net_device *netdev, priv->ring[i].tx_copybreak = priv->tx_copybreak;
break; + case ETHTOOL_RX_COPYBREAK: + priv->rx_copybreak = *(u32 *)data; + + for (i = h->kinfo.num_tqps; i < h->kinfo.num_tqps * 2; i++) + priv->ring[i].rx_copybreak = priv->rx_copybreak; + + break; default: ret = -EOPNOTSUPP; break;