This patchset adds frag page support in page pool and enables skb's page frag recycling based on page pool in the hns3 driver.
RFC v5: 1. Rename dma_addr[0] to pp_frag_count and adjust codes according to the rename.
RFC v4: 1. Use the dma_addr[1] to store bias. 2. Default to a pagecnt_bias of PAGE_SIZE - 1. 3. other minor comment suggested by Alexander.
RFC v3: 1. Implement the semantic of "page recycling only wait for the page pool user instead of all user of a page" 2. Support the frag allocation of different sizes 3. Merge patch 4 & 5 to one patch as it does not make sense to use page_pool_dev_alloc_pages() API directly with elevated refcnt. 4. other minor comment suggested by Alexander.
RFC v2: 1. Split patch 1 to more reviewable one. 2. Repurpose the lower 12 bits of the dma address to store the pagecnt_bias as suggested by Alexander. 3. support recycling to pool->alloc for elevated refcnt case too.
Yunsheng Lin (4): page_pool: keep pp info as long as page pool owns the page page_pool: add interface to manipulate frag count in page pool page_pool: add frag page recycling support in page pool net: hns3: support skb's frag page recycling based on page pool
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 82 +++++++++++++- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 3 + drivers/net/ethernet/marvell/mvneta.c | 6 +- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +- drivers/net/ethernet/ti/cpsw.c | 2 +- drivers/net/ethernet/ti/cpsw_new.c | 2 +- include/linux/mm_types.h | 8 +- include/linux/skbuff.h | 4 +- include/net/page_pool.h | 83 +++++++++++--- net/core/page_pool.c | 140 +++++++++++++++++++++--- 10 files changed, 283 insertions(+), 49 deletions(-)
Currently, page->pp is cleared and set every time the page is recycled, which is unnecessary.
So only set the page->pp when the page is added to the page pool and only clear it when the page is released from the page pool.
This is also a preparation to support allocating frag page in page pool.
Reviewed-by: Ilias Apalodimas ilias.apalodimas@linaro.org Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- drivers/net/ethernet/marvell/mvneta.c | 6 +----- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +- drivers/net/ethernet/ti/cpsw.c | 2 +- drivers/net/ethernet/ti/cpsw_new.c | 2 +- include/linux/skbuff.h | 4 +--- include/net/page_pool.h | 7 ------- net/core/page_pool.c | 21 +++++++++++++++++---- 7 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 361bc4f..89bf31fd 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool, if (!skb) return ERR_PTR(-ENOMEM);
- skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool); + skb_mark_for_recycle(skb);
skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); @@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool, skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, skb_frag_page(frag), skb_frag_off(frag), skb_frag_size(frag), PAGE_SIZE); - /* We don't need to reset pp_recycle here. It's already set, so - * just mark fragments for recycling. - */ - page_pool_store_mem_info(skb_frag_page(frag), pool); }
return skb; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 3229baf..320eddb 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3995,7 +3995,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, }
if (pp) - skb_mark_for_recycle(skb, page, pp); + skb_mark_for_recycle(skb); else dma_unmap_single_attrs(dev->dev.parent, dma_addr, bm_pool->buf_size, DMA_FROM_DEVICE, diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index cbbd0f6..9d59143 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -431,7 +431,7 @@ static void cpsw_rx_handler(void *token, int len, int status) skb->protocol = eth_type_trans(skb, ndev);
/* mark skb for recycling */ - skb_mark_for_recycle(skb, page, pool); + skb_mark_for_recycle(skb); netif_receive_skb(skb);
ndev->stats.rx_bytes += len; diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 57d279f..a4234a3 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -374,7 +374,7 @@ static void cpsw_rx_handler(void *token, int len, int status) skb->protocol = eth_type_trans(skb, ndev);
/* mark skb for recycling */ - skb_mark_for_recycle(skb, page, pool); + skb_mark_for_recycle(skb); netif_receive_skb(skb);
ndev->stats.rx_bytes += len; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b2db9cd..7795979 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb) }
#ifdef CONFIG_PAGE_POOL -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page, - struct page_pool *pp) +static inline void skb_mark_for_recycle(struct sk_buff *skb) { skb->pp_recycle = 1; - page_pool_store_mem_info(page, pp); } #endif
diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 3dd62dd..8d7744d 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -253,11 +253,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool) spin_unlock_bh(&pool->ring.producer_lock); }
-/* Store mem_info on struct page and use it while recycling skb frags */ -static inline -void page_pool_store_mem_info(struct page *page, struct page_pool *pp) -{ - page->pp = pp; -} - #endif /* _NET_PAGE_POOL_H */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5e4eb45..78838c6 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -206,6 +206,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) return true; }
+static void page_pool_set_pp_info(struct page_pool *pool, + struct page *page) +{ + page->pp = pool; + page->pp_magic |= PP_SIGNATURE; +} + +static void page_pool_clear_pp_info(struct page *page) +{ + page->pp_magic = 0; + page->pp = NULL; +} + static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { @@ -222,7 +235,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; }
- page->pp_magic |= PP_SIGNATURE; + page_pool_set_pp_info(pool, page);
/* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -266,7 +279,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, put_page(page); continue; } - page->pp_magic |= PP_SIGNATURE; + + page_pool_set_pp_info(pool, page); pool->alloc.cache[pool->alloc.count++] = page; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -345,7 +359,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: - page->pp_magic = 0; + page_pool_clear_pp_info(page);
/* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. @@ -644,7 +658,6 @@ bool page_pool_return_skb_page(struct page *page) * The page will be returned to the pool here regardless of the * 'flipped' fragment being in use or not. */ - page->pp = NULL; page_pool_put_full_page(pp, page, false);
return true;
As suggested by Alexander, "A DMA mapping should be page aligned anyway so the lower 12 bits would be reserved 0", so it might make more sense to repurpose the lower 12 bits of the dma address to store the frag count for frag page support in page pool for 32 bit systems with 64 bit dma, which should be rare these days.
For normal systems, the dma_addr[1] in 'struct page' is not used, so we can reuse one of the dma_addr for storing frag count, which means how many frags this page might be split into.
The PAGE_POOL_DMA_USE_PP_FRAG_COUNT macro is added to decide where to store the frag count, as the "sizeof(dma_addr_t) > sizeof(unsigned long)" is false for most systems these days, so hopefully the compiler will optimize out the unused code for those systems.
The newly added page_pool_set_frag_count() should be called before the page is passed to any user. Otherwise, call the newly added page_pool_atomic_sub_frag_count_return().
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- include/linux/mm_types.h | 8 +++++-- include/net/page_pool.h | 54 ++++++++++++++++++++++++++++++++++++++++++------ net/core/page_pool.c | 10 +++++++++ 3 files changed, 64 insertions(+), 8 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d33d97c..82bcbb0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -103,11 +103,15 @@ struct page { unsigned long pp_magic; struct page_pool *pp; unsigned long _pp_mapping_pad; + atomic_long_t pp_frag_count; /** * @dma_addr: might require a 64-bit value on - * 32-bit architectures. + * 32-bit architectures, if so, store the lower 32 + * bits in pp_frag_count, and a DMA mapping should + * be page aligned, so the frag count can be stored + * in lower 12 bits for 4K page size. */ - unsigned long dma_addr[2]; + unsigned long dma_addr; }; struct { /* slab, slob and slub */ union { diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 8d7744d..ef449c2 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -198,19 +198,61 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); }
+#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ + (sizeof(dma_addr_t) > sizeof(unsigned long)) + static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { - dma_addr_t ret = page->dma_addr[0]; - if (sizeof(dma_addr_t) > sizeof(unsigned long)) - ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16; + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) { + ret <<= 32; + ret |= atomic_long_read(&page->pp_frag_count) & PAGE_MASK; + } + return ret; }
static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) { - page->dma_addr[0] = addr; - if (sizeof(dma_addr_t) > sizeof(unsigned long)) - page->dma_addr[1] = upper_32_bits(addr); + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) { + atomic_long_set(&page->pp_frag_count, addr & PAGE_MASK); + addr >>= 32; + } + + page->dma_addr = addr; +} + +static inline long page_pool_atomic_sub_frag_count_return(struct page *page, + long nr) +{ + long frag_count = atomic_long_read(&page->pp_frag_count); + long ret; + + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) { + if ((frag_count & ~PAGE_MASK) == nr) + return 0; + + ret = atomic_long_sub_return(nr, &page->pp_frag_count); + WARN_ON((ret & PAGE_MASK) != (frag_count & PAGE_MASK)); + ret &= ~PAGE_MASK; + } else { + if (frag_count == nr) + return 0; + + ret = atomic_long_sub_return(nr, &page->pp_frag_count); + WARN_ON(ret < 0); + } + + return ret; +} + +static inline void page_pool_set_frag_count(struct page *page, long nr) +{ + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + nr |= atomic_long_read(&page->pp_frag_count) & PAGE_MASK; + + atomic_long_set(&page->pp_frag_count, nr); }
static inline bool is_page_pool_compiled_in(void) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 78838c6..0082f33 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -198,6 +198,16 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) if (dma_mapping_error(pool->p.dev, dma)) return false;
+ if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && + WARN_ON(pool->p.flags & PP_FLAG_PAGE_FRAG && + dma & ~PAGE_MASK)) { + dma_unmap_page_attrs(pool->p.dev, dma, + PAGE_SIZE << pool->p.order, + pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC); + return false; + } + page_pool_set_dma_addr(page, dma);
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
On 14/07/2021 11.34, Yunsheng Lin wrote:
As suggested by Alexander, "A DMA mapping should be page aligned anyway so the lower 12 bits would be reserved 0", so it might make more sense to repurpose the lower 12 bits of the dma address to store the frag count for frag page support in page pool for 32 bit systems with 64 bit dma, which should be rare those days.
Do we have any real driver users with 32-bit arch and 64-bit DMA, that want to use this new frag-count system you are adding to page_pool?
This "lower 12-bit use" complicates the code we need to maintain forever. My guess is that it is never used, but we need to update and maintain it, and it will never be tested.
Why don't you simply reject using page_pool flag PP_FLAG_PAGE_FRAG during setup of the page_pool for this case?
if ((pool->p.flags & PP_FLAG_PAGE_FRAG) && (sizeof(dma_addr_t) > sizeof(unsigned long))) goto reject-setup;
For normal system, the dma_addr[1] in 'struct page' is not used, so we can reuse one of the dma_addr for storing frag count, which means how many frags this page might be splited to.
The PAGE_POOL_DMA_USE_PP_FRAG_COUNT macro is added to decide where to store the frag count, as the "sizeof(dma_addr_t) > sizeof(unsigned long)" is false for most systems those days, so hopefully the compiler will optimize out the unused code for those systems.
The newly added page_pool_set_frag_count() should be called before the page is passed to any user. Otherwise, call the newly added page_pool_atomic_sub_frag_count_return().
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com
include/linux/mm_types.h | 8 +++++-- include/net/page_pool.h | 54 ++++++++++++++++++++++++++++++++++++++++++------ net/core/page_pool.c | 10 +++++++++ 3 files changed, 64 insertions(+), 8 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d33d97c..82bcbb0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -103,11 +103,15 @@ struct page { unsigned long pp_magic; struct page_pool *pp; unsigned long _pp_mapping_pad;
atomic_long_t pp_frag_count; /** * @dma_addr: might require a 64-bit value on
* 32-bit architectures.
* 32-bit architectures, if so, store the lower 32
* bits in pp_frag_count, and a DMA mapping should
* be page aligned, so the frag count can be stored
* in lower 12 bits for 4K page size. */
unsigned long dma_addr[2];
unsigned long dma_addr; }; struct { /* slab, slob and slub */ union {
diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 8d7744d..ef449c2 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -198,19 +198,61 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); }
+#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \
(sizeof(dma_addr_t) > sizeof(unsigned long))
- static inline dma_addr_t page_pool_get_dma_addr(struct page *page) {
- dma_addr_t ret = page->dma_addr[0];
- if (sizeof(dma_addr_t) > sizeof(unsigned long))
ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
dma_addr_t ret = page->dma_addr;
if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
ret <<= 32;
ret |= atomic_long_read(&page->pp_frag_count) & PAGE_MASK;
}
return ret; }
static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) {
- page->dma_addr[0] = addr;
- if (sizeof(dma_addr_t) > sizeof(unsigned long))
page->dma_addr[1] = upper_32_bits(addr);
- if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
atomic_long_set(&page->pp_frag_count, addr & PAGE_MASK);
addr >>= 32;
- }
- page->dma_addr = addr;
+}
+static inline long page_pool_atomic_sub_frag_count_return(struct page *page,
long nr)
+{
- long frag_count = atomic_long_read(&page->pp_frag_count);
- long ret;
- if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
if ((frag_count & ~PAGE_MASK) == nr)
return 0;
ret = atomic_long_sub_return(nr, &page->pp_frag_count);
WARN_ON((ret & PAGE_MASK) != (frag_count & PAGE_MASK));
ret &= ~PAGE_MASK;
- } else {
if (frag_count == nr)
return 0;
ret = atomic_long_sub_return(nr, &page->pp_frag_count);
WARN_ON(ret < 0);
- }
- return ret;
+}
+static inline void page_pool_set_frag_count(struct page *page, long nr) +{
if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
nr |= atomic_long_read(&page->pp_frag_count) & PAGE_MASK;
atomic_long_set(&page->pp_frag_count, nr); }
static inline bool is_page_pool_compiled_in(void)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 78838c6..0082f33 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -198,6 +198,16 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) if (dma_mapping_error(pool->p.dev, dma)) return false;
if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
WARN_ON(pool->p.flags & PP_FLAG_PAGE_FRAG &&
dma & ~PAGE_MASK)) {
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order,
pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC);
return false;
}
page_pool_set_dma_addr(page, dma);
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
On Wed, 14 Jul 2021 at 13:18, Jesper Dangaard Brouer jbrouer@redhat.com wrote:
On 14/07/2021 11.34, Yunsheng Lin wrote:
As suggested by Alexander, "A DMA mapping should be page aligned anyway so the lower 12 bits would be reserved 0", so it might make more sense to repurpose the lower 12 bits of the dma address to store the frag count for frag page support in page pool for 32 bit systems with 64 bit dma, which should be rare those days.
Do we have any real driver users with 32-bit arch and 64-bit DMA, that want to use this new frag-count system you are adding to page_pool?
This "lower 12-bit use" complicates the code we need to maintain forever. My guess is that it is never used, but we need to update and maintain it, and it will never be tested.
Why don't you simply reject using page_pool flag PP_FLAG_PAGE_FRAG during setup of the page_pool for this case?
if ((pool->p.flags & PP_FLAG_PAGE_FRAG) && (sizeof(dma_addr_t) > sizeof(unsigned long))) goto reject-setup;
+1
For normal system, the dma_addr[1] in 'struct page' is not used, so we can reuse one of the dma_addr for storing frag count, which means how many frags this page might be splited to.
The PAGE_POOL_DMA_USE_PP_FRAG_COUNT macro is added to decide where to store the frag count, as the "sizeof(dma_addr_t) > sizeof(unsigned long)" is false for most systems those days, so hopefully the compiler will optimize out the unused code for those systems.
The newly added page_pool_set_frag_count() should be called before the page is passed to any user. Otherwise, call the newly added page_pool_atomic_sub_frag_count_return().
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com
include/linux/mm_types.h | 8 +++++-- include/net/page_pool.h | 54 ++++++++++++++++++++++++++++++++++++++++++------ net/core/page_pool.c | 10 +++++++++ 3 files changed, 64 insertions(+), 8 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d33d97c..82bcbb0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -103,11 +103,15 @@ struct page { unsigned long pp_magic; struct page_pool *pp; unsigned long _pp_mapping_pad;
atomic_long_t pp_frag_count; /** * @dma_addr: might require a 64-bit value on
* 32-bit architectures.
* 32-bit architectures, if so, store the lower 32
* bits in pp_frag_count, and a DMA mapping should
* be page aligned, so the frag count can be stored
* in lower 12 bits for 4K page size. */
unsigned long dma_addr[2];
unsigned long dma_addr; }; struct { /* slab, slob and slub */ union {
diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 8d7744d..ef449c2 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -198,19 +198,61 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); }
+#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \
(sizeof(dma_addr_t) > sizeof(unsigned long))
- static inline dma_addr_t page_pool_get_dma_addr(struct page *page) {
dma_addr_t ret = page->dma_addr[0];
if (sizeof(dma_addr_t) > sizeof(unsigned long))
ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
dma_addr_t ret = page->dma_addr;
if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
ret <<= 32;
ret |= atomic_long_read(&page->pp_frag_count) & PAGE_MASK;
}
return ret;
}
static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) {
page->dma_addr[0] = addr;
if (sizeof(dma_addr_t) > sizeof(unsigned long))
page->dma_addr[1] = upper_32_bits(addr);
if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
atomic_long_set(&page->pp_frag_count, addr & PAGE_MASK);
addr >>= 32;
}
page->dma_addr = addr;
+}
+static inline long page_pool_atomic_sub_frag_count_return(struct page *page,
long nr)
+{
long frag_count = atomic_long_read(&page->pp_frag_count);
long ret;
if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
if ((frag_count & ~PAGE_MASK) == nr)
return 0;
ret = atomic_long_sub_return(nr, &page->pp_frag_count);
WARN_ON((ret & PAGE_MASK) != (frag_count & PAGE_MASK));
ret &= ~PAGE_MASK;
} else {
if (frag_count == nr)
return 0;
ret = atomic_long_sub_return(nr, &page->pp_frag_count);
WARN_ON(ret < 0);
}
return ret;
+}
+static inline void page_pool_set_frag_count(struct page *page, long nr) +{
if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
nr |= atomic_long_read(&page->pp_frag_count) & PAGE_MASK;
atomic_long_set(&page->pp_frag_count, nr);
}
static inline bool is_page_pool_compiled_in(void)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 78838c6..0082f33 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -198,6 +198,16 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) if (dma_mapping_error(pool->p.dev, dma)) return false;
if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
WARN_ON(pool->p.flags & PP_FLAG_PAGE_FRAG &&
dma & ~PAGE_MASK)) {
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order,
pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC);
return false;
}
page_pool_set_dma_addr(page, dma); if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
On Wed, Jul 14, 2021 at 3:18 AM Jesper Dangaard Brouer jbrouer@redhat.com wrote:
On 14/07/2021 11.34, Yunsheng Lin wrote:
As suggested by Alexander, "A DMA mapping should be page aligned anyway so the lower 12 bits would be reserved 0", so it might make more sense to repurpose the lower 12 bits of the dma address to store the frag count for frag page support in page pool for 32 bit systems with 64 bit dma, which should be rare those days.
Do we have any real driver users with 32-bit arch and 64-bit DMA, that want to use this new frag-count system you are adding to page_pool?
This "lower 12-bit use" complicates the code we need to maintain forever. My guess is that it is never used, but we need to update and maintain it, and it will never be tested.
Why don't you simply reject using page_pool flag PP_FLAG_PAGE_FRAG during setup of the page_pool for this case?
if ((pool->p.flags & PP_FLAG_PAGE_FRAG) && (sizeof(dma_addr_t) > sizeof(unsigned long))) goto reject-setup;
That sounds good to me if we want to go that route. It would simplify this quite a bit since essentially we could just drop these if blocks.
Thanks.
- Alex
On 2021/7/14 22:46, Alexander Duyck wrote:
On Wed, Jul 14, 2021 at 3:18 AM Jesper Dangaard Brouer jbrouer@redhat.com wrote:
On 14/07/2021 11.34, Yunsheng Lin wrote:
As suggested by Alexander, "A DMA mapping should be page aligned anyway so the lower 12 bits would be reserved 0", so it might make more sense to repurpose the lower 12 bits of the dma address to store the frag count for frag page support in page pool for 32 bit systems with 64 bit dma, which should be rare those days.
Do we have any real driver users with 32-bit arch and 64-bit DMA, that want to use this new frag-count system you are adding to page_pool?
This "lower 12-bit use" complicates the code we need to maintain forever. My guess is that it is never used, but we need to update and maintain it, and it will never be tested.
Why don't you simply reject using page_pool flag PP_FLAG_PAGE_FRAG during setup of the page_pool for this case?
if ((pool->p.flags & PP_FLAG_PAGE_FRAG) && (sizeof(dma_addr_t) > sizeof(unsigned long))) goto reject-setup;
That sounds good to me if we want to go that route. It would simplify this quite a bit since essentially we could just drop these if blocks.
Ok, let's wait for a few days to see if there is anyone with a 32-bit arch and 64-bit DMA system who cares enough to use the frag-count support in the page pool.
Thanks.
- Alex
.
Hi Yunsheng,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on net-next/master] [also build test ERROR on net/master linus/master v5.14-rc1 next-20210714] [cannot apply to sparc-next/master] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Yunsheng-Lin/add-frag-page-support-... base: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 5e437416ff66981d8154687cfdf7de50b1d82bfc config: sparc-defconfig (attached as .config) compiler: sparc-linux-gcc (GCC) 9.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/0day-ci/linux/commit/247943d70c2069ca3fa3a272f3eb26b463e1... git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Yunsheng-Lin/add-frag-page-support-in-page-pool/20210714-173612 git checkout 247943d70c2069ca3fa3a272f3eb26b463e17f4d # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=sparc
If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot lkp@intel.com
All errors (new ones prefixed by >>):
In file included from include/linux/skbuff.h:40, from include/linux/if_ether.h:19, from include/linux/etherdevice.h:20, from arch/sparc/kernel/idprom.c:13: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: error: left shift count >= width of type [-Werror=shift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: error: right shift count >= width of type [-Werror=shift-count-overflow]
220 | addr >>= 32; | ^~~ cc1: all warnings being treated as errors
vim +209 include/net/page_pool.h
200 201 #define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ 202 (sizeof(dma_addr_t) > sizeof(unsigned long)) 203 204 static inline dma_addr_t page_pool_get_dma_addr(struct page *page) 205 { 206 dma_addr_t ret = page->dma_addr; 207 208 if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
209 ret <<= 32;
210 ret |= atomic_long_read(&page->pp_frag_count) & PAGE_MASK; 211 } 212 213 return ret; 214 } 215 216 static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) 217 { 218 if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) { 219 atomic_long_set(&page->pp_frag_count, addr & PAGE_MASK);
220 addr >>= 32;
221 } 222 223 page->dma_addr = addr; 224 } 225
--- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Hi Yunsheng,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on net-next/master] [also build test WARNING on net/master linus/master v5.14-rc1 next-20210715] [cannot apply to sparc-next/master] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Yunsheng-Lin/add-frag-page-support-... base: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 5e437416ff66981d8154687cfdf7de50b1d82bfc config: m68k-buildonly-randconfig-r006-20210714 (attached as .config) compiler: m68k-linux-gcc (GCC) 9.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/0day-ci/linux/commit/247943d70c2069ca3fa3a272f3eb26b463e1... git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Yunsheng-Lin/add-frag-page-support-in-page-pool/20210714-173612 git checkout 247943d70c2069ca3fa3a272f3eb26b463e17f4d # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=m68k
If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot lkp@intel.com
All warnings (new ones prefixed by >>):
In file included from include/linux/skbuff.h:40, from include/net/net_namespace.h:39, from include/linux/netdevice.h:37, from drivers/net/ethernet/wiznet/w5100.c:11: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ -- In file included from include/linux/skbuff.h:40, from include/linux/filter.h:13, from kernel/bpf/core.c:21: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ kernel/bpf/core.c: At top level: kernel/bpf/core.c:1356:12: warning: no previous prototype for 'bpf_probe_read_kernel' [-Wmissing-prototypes] 1356 | u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) | ^~~~~~~~~~~~~~~~~~~~~ -- In file included from include/linux/skbuff.h:40, from include/linux/filter.h:13, from include/linux/bpf_verifier.h:9, from kernel/bpf/verifier.c:12: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ In file included from include/linux/bpf_verifier.h:9, from kernel/bpf/verifier.c:12: kernel/bpf/verifier.c: In function 'jit_subprogs': include/linux/filter.h:363:4: warning: cast between incompatible function types from 'unsigned int (*)(const void *, const struct bpf_insn *)' to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} [-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ kernel/bpf/verifier.c:12209:16: note: in expansion of macro 'BPF_CAST_CALL' 12209 | insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - | ^~~~~~~~~~~~~ kernel/bpf/verifier.c: In function 'do_misc_fixups': include/linux/filter.h:363:4: warning: cast between incompatible function types from 'void * (* const)(struct bpf_map *, void *)' to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} [-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ kernel/bpf/verifier.c:12648:17: note: in expansion of macro 'BPF_CAST_CALL' 12648 | insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - | ^~~~~~~~~~~~~ include/linux/filter.h:363:4: warning: cast between incompatible function types from 'int (* const)(struct bpf_map *, void *, void *, u64)' {aka 'int (* const)(struct bpf_map *, void *, void *, long long unsigned int)'} to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} [-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ kernel/bpf/verifier.c:12652:17: note: in expansion of macro 'BPF_CAST_CALL' 12652 | insn->imm = BPF_CAST_CALL(ops->map_update_elem) - | ^~~~~~~~~~~~~ include/linux/filter.h:363:4: warning: 
cast between incompatible function types from 'int (* const)(struct bpf_map *, void *)' to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} [-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ kernel/bpf/verifier.c:12656:17: note: in expansion of macro 'BPF_CAST_CALL' 12656 | insn->imm = BPF_CAST_CALL(ops->map_delete_elem) - | ^~~~~~~~~~~~~ include/linux/filter.h:363:4: warning: cast between incompatible function types from 'int (* const)(struct bpf_map *, void *, u64)' {aka 'int (* const)(struct bpf_map *, void *, long long unsigned int)'} to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} [-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ kernel/bpf/verifier.c:12660:17: note: in expansion of macro 'BPF_CAST_CALL' 12660 | insn->imm = BPF_CAST_CALL(ops->map_push_elem) - | ^~~~~~~~~~~~~ include/linux/filter.h:363:4: warning: cast between incompatible function types from 'int (* const)(struct bpf_map *, void *)' to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} [-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ kernel/bpf/verifier.c:12664:17: note: in expansion of macro 'BPF_CAST_CALL' 12664 | insn->imm = BPF_CAST_CALL(ops->map_pop_elem) - | ^~~~~~~~~~~~~ include/linux/filter.h:363:4: warning: cast between incompatible function types from 'int (* const)(struct bpf_map *, void *)' to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} 
[-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ kernel/bpf/verifier.c:12668:17: note: in expansion of macro 'BPF_CAST_CALL' 12668 | insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - | ^~~~~~~~~~~~~ include/linux/filter.h:363:4: warning: cast between incompatible function types from 'int (* const)(struct bpf_map *, u32, u64)' {aka 'int (* const)(struct bpf_map *, unsigned int, long long unsigned int)'} to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} [-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ kernel/bpf/verifier.c:12672:17: note: in expansion of macro 'BPF_CAST_CALL' 12672 | insn->imm = BPF_CAST_CALL(ops->map_redirect) - | ^~~~~~~~~~~~~ -- In file included from include/linux/skbuff.h:40, from include/linux/filter.h:13, from kernel/bpf/hashtab.c:8: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ In file included from kernel/bpf/hashtab.c:8: kernel/bpf/hashtab.c: In function 'htab_map_gen_lookup': include/linux/filter.h:363:4: warning: cast between incompatible function types from 'void * (*)(struct bpf_map *, void *)' to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} [-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ include/linux/filter.h:371:14: note: in definition of macro 'BPF_EMIT_CALL' 371 | .imm = ((FUNC) - __bpf_call_base) }) | ^~~~ kernel/bpf/hashtab.c:641:26: note: in expansion of macro 'BPF_CAST_CALL' 641 | *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); | ^~~~~~~~~~~~~ kernel/bpf/hashtab.c: In function 'htab_lru_map_gen_lookup': include/linux/filter.h:363:4: warning: cast between incompatible function types from 'void * (*)(struct bpf_map *, void *)' to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} [-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ include/linux/filter.h:371:14: note: in definition of macro 'BPF_EMIT_CALL' 371 | .imm = ((FUNC) - __bpf_call_base) }) | ^~~~ kernel/bpf/hashtab.c:682:26: note: in expansion of macro 'BPF_CAST_CALL' 682 | *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); | ^~~~~~~~~~~~~ kernel/bpf/hashtab.c: In function 'htab_of_map_gen_lookup': include/linux/filter.h:363:4: warning: cast between incompatible function types from 'void * (*)(struct bpf_map *, void *)' to 'u64 (*)(u64, u64, u64, u64, u64)' {aka 'long long unsigned int (*)(long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int, long long unsigned int)'} [-Wcast-function-type] 363 | ((u64 (*)(u64, u64, u64, u64, u64))(x)) | ^ 
include/linux/filter.h:371:14: note: in definition of macro 'BPF_EMIT_CALL' 371 | .imm = ((FUNC) - __bpf_call_base) }) | ^~~~ kernel/bpf/hashtab.c:2319:26: note: in expansion of macro 'BPF_CAST_CALL' 2319 | *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); | ^~~~~~~~~~~~~ -- In file included from include/linux/skbuff.h:40, from include/linux/filter.h:13, from include/linux/bpf_verifier.h:9, from kernel/bpf/btf.c:19: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ kernel/bpf/btf.c: In function 'btf_seq_show': kernel/bpf/btf.c:5694:22: warning: function 'btf_seq_show' might be a candidate for 'gnu_printf' format attribute [-Wsuggest-attribute=format] 5694 | seq_vprintf((struct seq_file *)show->target, fmt, args); | ^~~~~~~~ kernel/bpf/btf.c: In function 'btf_snprintf_show': kernel/bpf/btf.c:5731:2: warning: function 'btf_snprintf_show' might be a candidate for 'gnu_printf' format attribute [-Wsuggest-attribute=format] 5731 | len = vsnprintf(show->target, ssnprintf->len_left, fmt, args); | ^~~ -- In file included from include/linux/skbuff.h:40, from include/linux/if_ether.h:19, from include/uapi/linux/ethtool.h:19, from include/linux/ethtool.h:18, from include/linux/phy.h:16, from include/linux/fec.h:14, from arch/m68k/coldfire/device.c:16: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ arch/m68k/coldfire/device.c: At top level: arch/m68k/coldfire/device.c:512:13: error: 'MCFEDMA_BASE' undeclared here (not in a function); did you mean 'MCFQSPI_BASE'? 512 | .start = MCFEDMA_BASE, | ^~~~~~~~~~~~ | MCFQSPI_BASE arch/m68k/coldfire/device.c:513:26: error: 'MCFEDMA_SIZE' undeclared here (not in a function); did you mean 'MCFQSPI_SIZE'? 513 | .end = MCFEDMA_BASE + MCFEDMA_SIZE - 1, | ^~~~~~~~~~~~ | MCFQSPI_SIZE arch/m68k/coldfire/device.c:517:13: error: 'MCFEDMA_IRQ_INTR0' undeclared here (not in a function) 517 | .start = MCFEDMA_IRQ_INTR0, | ^~~~~~~~~~~~~~~~~ arch/m68k/coldfire/device.c:523:13: error: 'MCFEDMA_IRQ_INTR16' undeclared here (not in a function) 523 | .start = MCFEDMA_IRQ_INTR16, | ^~~~~~~~~~~~~~~~~~ arch/m68k/coldfire/device.c:529:13: error: 'MCFEDMA_IRQ_INTR56' undeclared here (not in a function) 529 | .start = MCFEDMA_IRQ_INTR56, | ^~~~~~~~~~~~~~~~~~ arch/m68k/coldfire/device.c:535:13: error: 'MCFEDMA_IRQ_ERR' undeclared here (not in a function) 535 | .start = MCFEDMA_IRQ_ERR, | ^~~~~~~~~~~~~~~ -- In file included from include/linux/skbuff.h:40, from include/net/net_namespace.h:39, from include/linux/init_task.h:18, from arch/m68k/kernel/process.c:30: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ arch/m68k/kernel/process.c: At top level: arch/m68k/kernel/process.c:115:16: warning: no previous prototype for 'm68k_clone' [-Wmissing-prototypes] 115 | asmlinkage int m68k_clone(struct pt_regs *regs) | ^~~~~~~~~~ arch/m68k/kernel/process.c:136:16: warning: no previous prototype for 'm68k_clone3' [-Wmissing-prototypes] 136 | asmlinkage int m68k_clone3(struct pt_regs *regs) | ^~~~~~~~~~~ arch/m68k/kernel/process.c:215:5: warning: no previous prototype for 'dump_fpu' [-Wmissing-prototypes] 215 | int dump_fpu (struct pt_regs *regs, struct user_m68kfp_struct *fpu) | ^~~~~~~~ -- In file included from include/linux/skbuff.h:40, from include/linux/filter.h:13, from kernel/kallsyms.c:25: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ kernel/kallsyms.c: At top level: kernel/kallsyms.c:502:12: warning: no previous prototype for 'arch_get_kallsym' [-Wmissing-prototypes] 502 | int __weak arch_get_kallsym(unsigned int symnum, unsigned long *value, | ^~~~~~~~~~~~~~~~ -- kernel/fork.c:162:13: warning: no previous prototype for 'arch_release_task_struct' [-Wmissing-prototypes] 162 | void __weak arch_release_task_struct(struct task_struct *tsk) | ^~~~~~~~~~~~~~~~~~~~~~~~ kernel/fork.c:752:20: warning: no previous prototype for 'arch_task_cache_init' [-Wmissing-prototypes] 752 | void __init __weak arch_task_cache_init(void) { } | ^~~~~~~~~~~~~~~~~~~~ kernel/fork.c:847:12: warning: no previous prototype for 'arch_dup_task_struct' [-Wmissing-prototypes] 847 | int __weak arch_dup_task_struct(struct task_struct *dst, | ^~~~~~~~~~~~~~~~~~~~ In file included from include/linux/skbuff.h:40, from include/net/net_namespace.h:39, from include/linux/init_task.h:18, from kernel/fork.c:983: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ -- In file included from include/linux/skbuff.h:40, from include/net/net_namespace.h:39, from include/linux/init_task.h:18, from kernel/exit.c:55: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ kernel/exit.c: At top level: kernel/exit.c:1810:13: warning: no previous prototype for 'abort' [-Wmissing-prototypes] 1810 | __weak void abort(void) | ^~~~~ -- In file included from include/linux/skbuff.h:40, from include/linux/if_ether.h:19, from include/uapi/linux/ethtool.h:19, from include/linux/ethtool.h:18, from include/linux/phy.h:16, from include/linux/acpi_mdio.h:9, from drivers/net/ethernet/marvell/mvmdio.c:21: include/net/page_pool.h: In function 'page_pool_get_dma_addr':
include/net/page_pool.h:209:7: warning: left shift count >= width of type [-Wshift-count-overflow]
209 | ret <<= 32; | ^~~ include/net/page_pool.h: In function 'page_pool_set_dma_addr':
include/net/page_pool.h:220:8: warning: right shift count >= width of type [-Wshift-count-overflow]
220 | addr >>= 32; | ^~~ At top level: drivers/net/ethernet/marvell/mvmdio.c:432:36: warning: 'orion_mdio_acpi_match' defined but not used [-Wunused-const-variable=] 432 | static const struct acpi_device_id orion_mdio_acpi_match[] = { | ^~~~~~~~~~~~~~~~~~~~~ ..
vim +209 include/net/page_pool.h
200 201 #define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ 202 (sizeof(dma_addr_t) > sizeof(unsigned long)) 203 204 static inline dma_addr_t page_pool_get_dma_addr(struct page *page) 205 { 206 dma_addr_t ret = page->dma_addr; 207 208 if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
209 ret <<= 32;
210 ret |= atomic_long_read(&page->pp_frag_count) & PAGE_MASK; 211 } 212 213 return ret; 214 } 215 216 static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) 217 { 218 if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) { 219 atomic_long_set(&page->pp_frag_count, addr & PAGE_MASK);
220 addr >>= 32;
221 } 222 223 page->dma_addr = addr; 224 } 225
--- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Currently page pool only supports page recycling when there is only one user of the page, and the split page reusing implemented in most drivers can not use the page pool, as the ping-pong way of reusing requires multi-user support in page pool.
Those ways of reusing or recycling have the below limitations: 1. A page from page pool can only be used by one user in order for the page recycling to happen. 2. The ping-pong way of reusing in most drivers does not support multiple descs using different parts of the same page in order to save memory.
So add multi-user support and frag page recycling in page pool to overcome the above limitations.
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- include/net/page_pool.h | 22 +++++++++- net/core/page_pool.c | 109 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 117 insertions(+), 14 deletions(-)
diff --git a/include/net/page_pool.h b/include/net/page_pool.h index ef449c2..3159b3a 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -45,7 +45,10 @@ * Please note DMA-sync-for-CPU is still * device driver responsibility */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) +#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ + PP_FLAG_DMA_SYNC_DEV |\ + PP_FLAG_PAGE_FRAG)
/* * Fast allocation side cache array/stack @@ -88,6 +91,9 @@ struct page_pool { unsigned long defer_warn;
u32 pages_state_hold_cnt; + unsigned int frag_offset; + long frag_allocated; + struct page *frag_page;
/* * Data structure for allocation side @@ -137,6 +143,20 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) return page_pool_alloc_pages(pool, gfp); }
+struct page *page_pool_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size, + gfp_t gfp); + +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_frag(pool, offset, size, gfp); +} + /* get the stored dma direction. A driver might decide to treat this locally and * avoid the extra cache line from page_pool to determine the direction */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 0082f33..e89434c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -24,6 +24,8 @@ #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ)
+#define BIAS_MAX (PAGE_SIZE - 1) + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -67,6 +69,14 @@ static int page_pool_init(struct page_pool *pool, */ }
+ /* Make sure there is at least one bias left as we depend on that + * to ensure the frag page is reserved to serve more users. + */ + if (pool->p.flags & PP_FLAG_PAGE_FRAG && + (PAGE_SIZE << pool->p.order > + dma_get_cache_alignment() * (BIAS_MAX - 1))) + return -EINVAL; + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM;
@@ -429,6 +439,11 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + /* It is not the last user for the page frag case */ + if (pool->p.flags & PP_FLAG_PAGE_FRAG && + page_pool_atomic_sub_frag_count_return(page, 1)) + return NULL; + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -452,19 +467,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, /* Page found as candidate for recycling */ return page; } - /* Fallback/non-XDP mode: API user have elevated refcnt. - * - * Many drivers split up the page into fragments, and some - * want to keep doing this to save memory and do refcnt based - * recycling. Support this use case too, to ease drivers - * switching between XDP/non-XDP. - * - * In-case page_pool maintains the DMA mapping, API user must - * call page_pool_put_page once. In this elevated refcnt - * case, the DMA is unmapped/released, as driver is likely - * doing refcnt based recycle tricks, meaning another process - * will be invoking put_page. - */ + /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); @@ -521,6 +524,84 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, } EXPORT_SYMBOL(page_pool_put_page_bulk);
+static struct page *page_pool_drain_frag(struct page_pool *pool, + struct page *page) +{ + long drain_count = BIAS_MAX - pool->frag_allocated; + + /* page pool is not the last user */ + if (page_pool_atomic_sub_frag_count_return(page, drain_count)) + return NULL; + + if (likely(page_ref_count(page) == 1 && + !page_is_pfmemalloc(page))) { + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, -1); + + return page; + } + + page_pool_return_page(pool, page); + return NULL; +} + +static void page_pool_free_frag(struct page_pool *pool) +{ + long drain_count = BIAS_MAX - pool->frag_allocated; + struct page *page = pool->frag_page; + + pool->frag_page = NULL; + + if (!page || + page_pool_atomic_sub_frag_count_return(page, drain_count)) + return; + + page_pool_return_page(pool, page); +} + +struct page *page_pool_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + struct page *page = pool->frag_page; + + if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || + size > max_size)) + return NULL; + + size = ALIGN(size, dma_get_cache_alignment()); + *offset = pool->frag_offset; + + if (page && *offset + size > max_size) { + page = page_pool_drain_frag(pool, page); + if (page) + goto frag_reset; + } + + if (!page) { + page = page_pool_alloc_pages(pool, gfp); + if (unlikely(!page)) { + pool->frag_page = NULL; + return NULL; + } + + pool->frag_page = page; + +frag_reset: + pool->frag_allocated = 1; + *offset = 0; + pool->frag_offset = size; + page_pool_set_frag_count(page, BIAS_MAX); + return page; + } + + pool->frag_allocated++; + pool->frag_offset = *offset + size; + return page; +} +EXPORT_SYMBOL(page_pool_alloc_frag); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; @@ -626,6 +707,8 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_put(pool)) return;
+ page_pool_free_frag(pool); + if (!page_pool_release(pool)) return;
This patch adds skb's frag page recycling support based on the frag page support in page pool.
The performance improves about 10~20% for single thread iperf TCP flow with IOMMU disabled when iperf server and irq/NAPI have a different CPU.
The performance improves about 135%(14Gbit to 33Gbit) for single thread iperf TCP flow when IOMMU is in strict mode and iperf server shares the same cpu with irq/NAPI.
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 82 +++++++++++++++++++++++-- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 3 + 2 files changed, 80 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index cdb5f14..f3f9b13 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3205,6 +3205,21 @@ static int hns3_alloc_buffer(struct hns3_enet_ring *ring, unsigned int order = hns3_page_order(ring); struct page *p;
+ if (ring->page_pool) { + p = page_pool_dev_alloc_frag(ring->page_pool, + &cb->page_offset, + hns3_buf_size(ring)); + if (unlikely(!p)) + return -ENOMEM; + + cb->priv = p; + cb->buf = page_address(p); + cb->dma = page_pool_get_dma_addr(p); + cb->type = DESC_TYPE_FRAG; + cb->reuse_flag = 0; + return 0; + } + p = dev_alloc_pages(order); if (!p) return -ENOMEM; @@ -3227,8 +3242,13 @@ static void hns3_free_buffer(struct hns3_enet_ring *ring, if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB)) napi_consume_skb(cb->priv, budget); - else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias) - __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); + else if (!HNAE3_IS_TX_RING(ring)) { + if (cb->type & DESC_TYPE_PAGE && cb->pagecnt_bias) + __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); + else if (cb->type & DESC_TYPE_FRAG) + page_pool_put_full_page(ring->page_pool, cb->priv, + false); + } memset(cb, 0, sizeof(*cb)); }
@@ -3315,7 +3335,7 @@ static int hns3_alloc_and_map_buffer(struct hns3_enet_ring *ring, int ret;
ret = hns3_alloc_buffer(ring, cb); - if (ret) + if (ret || ring->page_pool) goto out;
ret = hns3_map_buffer(ring, cb); @@ -3337,7 +3357,8 @@ static int hns3_alloc_and_attach_buffer(struct hns3_enet_ring *ring, int i) if (ret) return ret;
- ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma); + ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + + ring->desc_cb[i].page_offset);
return 0; } @@ -3367,7 +3388,8 @@ static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i, { hns3_unmap_buffer(ring, &ring->desc_cb[i]); ring->desc_cb[i] = *res_cb; - ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma); + ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + + ring->desc_cb[i].page_offset); ring->desc[i].rx.bd_base_info = 0; }
@@ -3539,6 +3561,12 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i, u32 frag_size = size - pull_len; bool reused;
+ if (ring->page_pool) { + skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, + frag_size, truesize); + return; + } + /* Avoid re-using remote or pfmem page */ if (unlikely(!dev_page_is_reusable(desc_cb->priv))) goto out; @@ -3856,6 +3884,9 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, /* We can reuse buffer as-is, just make sure it is reusable */ if (dev_page_is_reusable(desc_cb->priv)) desc_cb->reuse_flag = 1; + else if (desc_cb->type & DESC_TYPE_FRAG) + page_pool_put_full_page(ring->page_pool, desc_cb->priv, + false); else /* This page cannot be reused so discard it */ __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias); @@ -3863,6 +3894,10 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, hns3_rx_ring_move_fw(ring); return 0; } + + if (ring->page_pool) + skb_mark_for_recycle(skb); + u64_stats_update_begin(&ring->syncp); ring->stats.seg_pkt_cnt++; u64_stats_update_end(&ring->syncp); @@ -3901,6 +3936,10 @@ static int hns3_add_frag(struct hns3_enet_ring *ring) "alloc rx fraglist skb fail\n"); return -ENXIO; } + + if (ring->page_pool) + skb_mark_for_recycle(new_skb); + ring->frag_num = 0;
if (ring->tail_skb) { @@ -4705,6 +4744,31 @@ static void hns3_put_ring_config(struct hns3_nic_priv *priv) priv->ring = NULL; }
+static void hns3_alloc_page_pool(struct hns3_enet_ring *ring) +{ + struct page_pool_params pp_params = { + .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG | + PP_FLAG_DMA_SYNC_DEV, + .order = hns3_page_order(ring), + .pool_size = ring->desc_num * hns3_buf_size(ring) / + (PAGE_SIZE << hns3_page_order(ring)), + .nid = dev_to_node(ring_to_dev(ring)), + .dev = ring_to_dev(ring), + .dma_dir = DMA_FROM_DEVICE, + .offset = 0, + .max_len = PAGE_SIZE << hns3_page_order(ring), + }; + + ring->page_pool = page_pool_create(&pp_params); + if (IS_ERR(ring->page_pool)) { + dev_warn(ring_to_dev(ring), "page pool creation failed: %ld\n", + PTR_ERR(ring->page_pool)); + ring->page_pool = NULL; + } else { + dev_info(ring_to_dev(ring), "page pool creation succeeded\n"); + } +} + static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) { int ret; @@ -4724,6 +4788,8 @@ static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) goto out_with_desc_cb;
if (!HNAE3_IS_TX_RING(ring)) { + hns3_alloc_page_pool(ring); + ret = hns3_alloc_ring_buffers(ring); if (ret) goto out_with_desc; @@ -4764,6 +4830,12 @@ void hns3_fini_ring(struct hns3_enet_ring *ring) devm_kfree(ring_to_dev(ring), tx_spare); ring->tx_spare = NULL; } + + if (!HNAE3_IS_TX_RING(ring) && ring->page_pool) { + page_pool_destroy(ring->page_pool); + ring->page_pool = NULL; + dev_info(ring_to_dev(ring), "page pool destroyed\n"); + } }
static int hns3_buf_size2type(u32 buf_size) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 15af3d9..115c0ce 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -6,6 +6,7 @@
#include <linux/dim.h> #include <linux/if_vlan.h> +#include <net/page_pool.h>
#include "hnae3.h"
@@ -307,6 +308,7 @@ enum hns3_desc_type { DESC_TYPE_BOUNCE_ALL = 1 << 3, DESC_TYPE_BOUNCE_HEAD = 1 << 4, DESC_TYPE_SGL_SKB = 1 << 5, + DESC_TYPE_FRAG = 1 << 6, };
struct hns3_desc_cb { @@ -451,6 +453,7 @@ struct hns3_enet_ring { struct hnae3_queue *tqp; int queue_index; struct device *dev; /* will be used for DMA mapping of descriptors */ + struct page_pool *page_pool;
/* statistic */ struct ring_stats stats;