From: Yunfeng Ye <yeyunfeng@huawei.com>
euleros inclusion
category: feature
feature: pagecache percpu refcount
bugzilla: 31398
CVE: NA
-------------------------------------------------
The pagecache manages a file's physical pages, and each page's life
cycle is tracked by an atomic reference count. As the number of CPU
cores grows, the cost of this atomic counting becomes very high when
reading file pagecaches under high concurrency.
For example, when running an nginx HTTP workload, the biggest hotspot
is the atomic operation in find_get_entry():
  11.94%  [kernel]  [k] find_get_entry
   7.45%  [kernel]  [k] do_tcp_sendpages
   6.12%  [kernel]  [k] generic_file_buffered_read
So we use the percpu refcount mechanism to fix this problem, and the
test results show that the read performance of nginx HTTP improves by
more than 100%:
  worker    original(requests/sec)    percpu(requests/sec)    improve
  64        759656.87                 1627088.95              114.2%
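For reference, the percpu_ref life cycle that the hooks in mm/filemap.c
rely on looks roughly like this (a minimal sketch; "struct obj" and
"obj_release" are illustrative names, not part of this patch):

  struct obj {
          struct percpu_ref ref;
  };

  static void obj_release(struct percpu_ref *ref)
  {
          struct obj *o = container_of(ref, struct obj, ref);

          percpu_ref_exit(ref);
          kfree(o);
  }

  /* setup: the count starts at 1, in percpu (fast) mode */
  err = percpu_ref_init(&o->ref, obj_release, 0, GFP_KERNEL);

  /* hot path: only a per-cpu counter is touched, no shared atomics */
  percpu_ref_get(&o->ref);
  percpu_ref_put(&o->ref);

  /* teardown: switch to atomic mode and drop the initial reference;
   * obj_release() runs once all outstanding references are gone */
  percpu_ref_kill(&o->ref);

In this patch, get_page()/put_page() on a PG_percpu_ref page take the
percpu_ref_get()/percpu_ref_put() path, and __delete_from_page_cache()
plays the percpu_ref_kill() role.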
Notes: we reuse page->lru to store the percpu_ref pointer, so pages
with the percpu attribute cannot be reclaimed by the memory reclaim
process; users should therefore avoid growing such files without limit.
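From userspace, a file opts in through the new fcntl command added by
this patch. A minimal sketch (error handling elided; the file name is
hypothetical):

  #include <fcntl.h>
  #include <unistd.h>

  #ifndef F_MAPPING_PERCPU
  #define F_MAPPING_PERCPU 1039  /* F_LINUX_SPECIFIC_BASE (1024) + 15 */
  #endif

  int fd = open("index.html", O_RDONLY);

  /* nonzero arg: set AS_PERCPU_REF on the file's mapping */
  fcntl(fd, F_MAPPING_PERCPU, 1);

  /* ... read-mostly serving: pagecache pages added from now on use
   * percpu reference counting ... */

  /* zero arg: clear the flag; pages added afterwards go back to the
   * normal atomic refcount */
  fcntl(fd, F_MAPPING_PERCPU, 0);
  close(fd);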
Signed-off-by: Yunfeng Ye <yeyunfeng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
fs/fcntl.c | 20 ++++++++++++
include/linux/mm.h | 23 +++++++++++++
include/linux/page-flags.h | 2 ++
include/linux/page_ref.h | 11 +++++++
include/linux/pagemap.h | 24 ++++++++++++++
include/trace/events/mmflags.h | 3 +-
include/uapi/linux/fcntl.h | 2 ++
mm/filemap.c | 74 ++++++++++++++++++++++++++++++++++++++++--
mm/swap.c | 2 ++
9 files changed, 158 insertions(+), 3 deletions(-)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 4137d96..0c70a8e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -29,6 +29,7 @@
#include <linux/poll.h>
#include <asm/siginfo.h>
#include <linux/uaccess.h>
+#include <linux/pagemap.h>
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
@@ -319,6 +320,22 @@ static long fcntl_rw_hint(struct file *file, unsigned int cmd,
}
}
+static long fcntl_mapping_percpu(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct address_space *mapping = filp->f_mapping;
+ unsigned long flag = arg;
+
+ if (!mapping)
+ return -EINVAL;
+
+ if (flag)
+ mapping_set_percpu_ref(mapping);
+ else
+ mapping_clear_percpu_ref(mapping);
+ return 0;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
@@ -426,6 +443,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_SET_FILE_RW_HINT:
err = fcntl_rw_hint(filp, cmd, arg);
break;
+ case F_MAPPING_PERCPU:
+ err = fcntl_mapping_percpu(filp, cmd, arg);
+ break;
default:
break;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 65d91b1..0e173a4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -528,6 +528,10 @@ static inline int pgd_devmap(pgd_t pgd)
static inline int put_page_testzero(struct page *page)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+ if (PagePercpuRef(page)) {
+ percpu_ref_put(page_percpu_ref(page));
+ return 0;
+ }
return page_ref_dec_and_test(page);
}
@@ -539,6 +543,10 @@ static inline int put_page_testzero(struct page *page)
*/
static inline int get_page_unless_zero(struct page *page)
{
+ if (PagePercpuRef(page)) {
+ percpu_ref_get(page_percpu_ref(page));
+ return true;
+ }
return page_ref_add_unless(page, 1, 0);
}
@@ -928,6 +936,11 @@ static inline bool is_device_public_page(const struct page *page)
static inline void get_page(struct page *page)
{
page = compound_head(page);
+
+ if (PagePercpuRef(page)) {
+ percpu_ref_get(page_percpu_ref(page));
+ return;
+ }
/*
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_refcount.
@@ -939,6 +952,11 @@ static inline void get_page(struct page *page)
static inline __must_check bool try_get_page(struct page *page)
{
page = compound_head(page);
+
+ if (PagePercpuRef(page)) {
+ percpu_ref_get(page_percpu_ref(page));
+ return true;
+ }
if (WARN_ON_ONCE(page_ref_count(page) <= 0))
return false;
page_ref_inc(page);
@@ -949,6 +967,11 @@ static inline void put_page(struct page *page)
{
page = compound_head(page);
+ if (PagePercpuRef(page)) {
+ percpu_ref_put(page_percpu_ref(page));
+ return;
+ }
+
/*
* For devmap managed pages we need to catch refcount transition from
* 2 to 1, when refcount reach one it means the page is free and we
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 3f066ce..7eb776a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -101,6 +101,7 @@ enum pageflags {
PG_young,
PG_idle,
#endif
+ PG_percpu_ref,
__NR_PAGEFLAGS,
/* Filesystems */
@@ -385,6 +386,7 @@ static inline bool set_hwpoison_free_buddy_page(struct page *page)
TESTCLEARFLAG(Young, young, PF_ANY)
PAGEFLAG(Idle, idle, PF_ANY)
#endif
+PAGEFLAG(PercpuRef, percpu_ref, PF_ANY)
/*
* On an anonymous page mapped into a user virtual memory area,
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 14d14be..3deab40 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -180,4 +180,15 @@ static inline void page_ref_unfreeze(struct page *page, int count)
__page_ref_unfreeze(page, count);
}
+static inline struct percpu_ref *page_percpu_ref(struct page *page)
+{
+ return *(struct percpu_ref **)&page->lru;
+}
+
+static inline void page_set_percpu_ref(struct page *page,
+ struct percpu_ref *ref)
+{
+ *(struct percpu_ref **)&page->lru = ref;
+}
+
#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 520627f..e889d99 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -29,6 +29,7 @@ enum mapping_flags {
AS_EXITING = 4, /* final truncate in progress */
/* writeback related tags are not used */
AS_NO_WRITEBACK_TAGS = 5,
+ AS_PERCPU_REF = 6, /* percpu ref counter for special inode */
};
/**
@@ -97,6 +98,21 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping)
return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}
+static inline void mapping_set_percpu_ref(struct address_space *mapping)
+{
+ set_bit(AS_PERCPU_REF, &mapping->flags);
+}
+
+static inline void mapping_clear_percpu_ref(struct address_space *mapping)
+{
+ clear_bit(AS_PERCPU_REF, &mapping->flags);
+}
+
+static inline int mapping_percpu_ref(struct address_space *mapping)
+{
+ return test_bit(AS_PERCPU_REF, &mapping->flags);
+}
+
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
return mapping->gfp_mask;
@@ -170,6 +186,10 @@ static inline int page_cache_get_speculative(struct page *page)
# ifdef CONFIG_PREEMPT_COUNT
VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
+ if (PagePercpuRef(page)) {
+ percpu_ref_get(page_percpu_ref(page));
+ return 1;
+ }
/*
* Preempt must be disabled here - we rely on rcu_read_lock doing
* this for us.
@@ -183,6 +203,10 @@ static inline int page_cache_get_speculative(struct page *page)
page_ref_inc(page);
#else
+ if (PagePercpuRef(page)) {
+ percpu_ref_get(page_percpu_ref(page));
+ return 1;
+ }
if (unlikely(!get_page_unless_zero(page))) {
/*
* Either the page has been freed, or will be freed.
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index a81cffb..2994f1c 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -104,7 +104,8 @@
IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
IF_HAVE_PG_IDLE(PG_young, "young" ) \
-IF_HAVE_PG_IDLE(PG_idle, "idle" )
+IF_HAVE_PG_IDLE(PG_idle, "idle"), \
+ {1UL << PG_percpu_ref, "percpu_ref" }
#define show_page_flags(flags) \
(flags) ? __print_flags(flags, "|", \
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd..6dcddf7 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -53,6 +53,8 @@
#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
+#define F_MAPPING_PERCPU (F_LINUX_SPECIFIC_BASE + 15)
+
/*
* Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
* used to clear any hints previously set.
diff --git a/mm/filemap.c b/mm/filemap.c
index c56c419..8a8bf78 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -48,6 +48,66 @@
#include <asm/mman.h>
+struct percpu_page {
+ struct percpu_ref ref;
+ struct page *page;
+};
+
+static void free_page_ref(struct percpu_ref *ref)
+{
+ struct percpu_page *p = (struct percpu_page *)ref;
+ struct page *page = p->page;
+
+ percpu_ref_exit(ref);
+ kfree(page_percpu_ref(page));
+ page_set_percpu_ref(page, NULL);
+
+ ClearPagePercpuRef(page);
+ /* really free the page */
+ put_page(page);
+}
+
+static void page_cache_init(struct address_space *mapping, struct page *page)
+{
+ struct percpu_page *p;
+
+ if (!mapping_percpu_ref(mapping))
+ return;
+
+ p = kzalloc(sizeof(struct percpu_page), GFP_KERNEL);
+ if (!p)
+ return;
+ if (percpu_ref_init(&p->ref, free_page_ref, 0, GFP_KERNEL))
+ goto err;
+
+ p->page = page;
+ page_set_percpu_ref(page, &p->ref);
+ SetPagePercpuRef(page);
+ get_page(page);
+ return;
+err:
+ kfree(p);
+}
+
+static void page_cache_exit(struct page *page)
+{
+ if (!PagePercpuRef(page))
+ return;
+
+ put_page(page);
+ ClearPagePercpuRef(page);
+ percpu_ref_exit(page_percpu_ref(page));
+ kfree(page_percpu_ref(page));
+ page_set_percpu_ref(page, NULL);
+}
+
+static void page_cache_kill(struct page *page)
+{
+ if (!PagePercpuRef(page))
+ return;
+ percpu_ref_kill(page_percpu_ref(page));
+}
+
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -264,6 +324,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
unaccount_page_cache_page(mapping, page);
page_cache_tree_delete(mapping, page, shadow);
+ page_cache_kill(page);
}
static void page_cache_free_page(struct address_space *mapping,
@@ -384,8 +445,10 @@ void delete_from_page_cache_batch(struct address_space *mapping,
page_cache_tree_delete_batch(mapping, pvec);
xa_unlock_irqrestore(&mapping->i_pages, flags);
- for (i = 0; i < pagevec_count(pvec); i++)
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ page_cache_kill(pvec->pages[i]);
page_cache_free_page(mapping, pvec->pages[i]);
+ }
}
int filemap_check_errors(struct address_space *mapping)
@@ -966,7 +1029,8 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
workingset_activation(page);
} else
ClearPageActive(page);
- lru_cache_add(page);
+ if (!PagePercpuRef(page))
+ lru_cache_add(page);
}
return ret;
}
@@ -1630,8 +1694,10 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
if (fgp_flags & FGP_ACCESSED)
__SetPageReferenced(page);
+ page_cache_init(mapping, page);
err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
if (unlikely(err)) {
+ page_cache_exit(page);
put_page(page);
page = NULL;
if (err == -EEXIST)
@@ -2320,9 +2386,11 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
error = -ENOMEM;
goto out;
}
+ page_cache_init(mapping, page);
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error) {
+ page_cache_exit(page);
put_page(page);
if (error == -EEXIST) {
error = 0;
@@ -2837,8 +2905,10 @@ static struct page *do_read_cache_page(struct address_space *mapping,
page = __page_cache_alloc(gfp);
if (!page)
return ERR_PTR(-ENOMEM);
+ page_cache_init(mapping, page);
err = add_to_page_cache_lru(page, mapping, index, gfp);
if (unlikely(err)) {
+ page_cache_exit(page);
put_page(page);
if (err == -EEXIST)
goto repeat;
diff --git a/mm/swap.c b/mm/swap.c
index 45fdbfb..320ac35 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -372,6 +372,8 @@ static void __lru_cache_activate_page(struct page *page)
void mark_page_accessed(struct page *page)
{
page = compound_head(page);
+ if (PagePercpuRef(page))
+ return;
if (!PageActive(page) && !PageUnevictable(page) &&
PageReferenced(page)) {
--
1.8.3