Add THP support, and supplement some of the native NUMA balancing policy controls.
Ze Zuo (6):
  mm: thp: refactor migrate_misplaced_page for thp migrate
  mm: migrate: account THP NUMA migration counters correctly
  mm: migrate: don't split THP for misplaced NUMA page
  mm: migrate: Skip spe access migration for Shared THP with Multiple Mappings
  mm: migrate: THP are supported in NUMA balancing under hardware sampling
  mm: migrate: backport some migrate policy judge
 mm/mem_sampling.c | 75 +++++++++++++++++++++++++++++++++++++++++++++--
 mm/migrate.c      | 51 +++++++++++++++++++++++++++-----
 2 files changed, 116 insertions(+), 10 deletions(-)
FeedBack: The patch(es) you sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/10298
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/G...
From: Ze Zuo <zuoze1@huawei.com>
Introduce the function alloc_misplaced_dst_page_thp to support THP NUMA page migration in migrate_misplaced_page().
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
---
 mm/migrate.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index c8491a744e8c..d55d0d1a6b89 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2082,6 +2082,23 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 	return newpage;
 }
+static struct page *alloc_misplaced_dst_page_thp(struct page *page,
+						 unsigned long data)
+{
+	int nid = (int) data;
+	struct page *newpage;
+
+	newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
+				   HPAGE_PMD_ORDER);
+	if (!newpage)
+		goto out;
+
+	prep_transhuge_page(newpage);
+
+out:
+	return newpage;
+}
+
 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
 	int page_lru;
@@ -2141,6 +2158,20 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	int isolated;
 	int nr_remaining;
 	LIST_HEAD(migratepages);
+	new_page_t *new;
+	bool compound;
+
+	/*
+	 * PTE mapped THP or HugeTLB page can't reach here so the page could
+	 * be either base page or THP. And it must be head page if it is
+	 * THP.
+	 */
+	compound = PageTransHuge(page);
+
+	if (compound)
+		new = alloc_misplaced_dst_page_thp;
+	else
+		new = alloc_misplaced_dst_page;
 	/*
 	 * Don't migrate file pages that are mapped in multiple processes
@@ -2162,9 +2193,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 		goto out;
 	list_add(&page->lru, &migratepages);
-	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     NULL, node, MIGRATE_ASYNC,
-				     MR_NUMA_MISPLACED);
+	nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
+				     MIGRATE_ASYNC, MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
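For reference, the allocation callback that migrate_pages() consumes is the new_page_t type from include/linux/migrate.h; the refactor above only changes which allocator is handed in. A minimal sketch of that contract (the free_page_t typedef is shown only for completeness):

typedef struct page *new_page_t(struct page *page, unsigned long private);
typedef void free_page_t(struct page *page, unsigned long private);

/*
 * 'new' above is declared as a pointer to this function type, so passing
 * '*new' (as the hunk does) and passing 'new' are equivalent in C: both
 * decay to the same function pointer before the call is made.
 */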
From: Ze Zuo <zuoze1@huawei.com>
Now that both base page and THP NUMA migration are done via migrate_misplaced_page(), keep the counters correct for THP as well.
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
---
 mm/migrate.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index d55d0d1a6b89..a3f9cbbfbfe5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2160,6 +2160,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	LIST_HEAD(migratepages);
 	new_page_t *new;
 	bool compound;
+	int nr_pages = thp_nr_pages(page);
 	/*
 	 * PTE mapped THP or HugeTLB page can't reach here so the page could
@@ -2198,13 +2199,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
-			dec_node_page_state(page, NR_ISOLATED_ANON +
-					page_is_file_lru(page));
+			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+					page_is_file_lru(page), -nr_pages);
 			putback_lru_page(page);
 		}
 		isolated = 0;
 	} else
-		count_vm_numa_event(NUMA_PAGE_MIGRATE);
+		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
 	BUG_ON(!list_empty(&migratepages));
 	return isolated;
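The switch from dec_node_page_state() to mod_node_page_state() is what makes the NR_ISOLATED_* accounting THP-aware: the former always adjusts the counter by one page, the latter takes an explicit delta. A minimal sketch of the idea (the helper name is hypothetical, purely for illustration; nr_pages must stay a signed type so that -nr_pages remains negative when widened to the long delta):

/*
 * Hypothetical helper: undo the NR_ISOLATED_* accounting for however many
 * base pages the (possibly huge) page represents, instead of always
 * decrementing by one.
 */
static void numamigrate_undo_isolation(struct page *page)
{
	int nr_pages = thp_nr_pages(page);	/* 1 for a base page, HPAGE_PMD_NR for a THP */

	mod_node_page_state(page_pgdat(page),
			    NR_ISOLATED_ANON + page_is_file_lru(page),
			    -nr_pages);
}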
From: Ze Zuo <zuoze1@huawei.com>
The NUMA fault path didn't split a THP when migration failed due to lack of memory on the target node, but the generic THP migration path does split it. Keep the old behavior for misplaced NUMA page migration.
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
---
 mm/migrate.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index a3f9cbbfbfe5..4c2a1d99db53 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1487,6 +1487,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 	struct page *page2;
 	int swapwrite = current->flags & PF_SWAPWRITE;
 	int rc, nr_subpages;
+	bool nosplit = (reason == MR_NUMA_MISPLACED);
 	if (!swapwrite)
 		current->flags |= PF_SWAPWRITE;
@@ -1527,8 +1528,9 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
			 * pages are added to the tail of the list so
			 * we encounter them after the rest of the list
			 * is processed.
+			 * THP NUMA access migration doesn't split THP to retry.
			 */
-			if (is_thp) {
+			if (is_thp && !nosplit) {
				lock_page(page);
				rc = split_huge_page_to_list(page, from);
				unlock_page(page);
From: Ze Zuo <zuoze1@huawei.com>
The THP NUMA fault path actually refrains from migrating shared THPs (mapped by multiple processes), so bail out early if the mapcount is > 1 to keep that behavior.
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
---
 mm/migrate.c | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/mm/migrate.c b/mm/migrate.c
index 4c2a1d99db53..e71683eaa8b5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2107,6 +2107,10 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+	/* Do not migrate THP mapped by multiple processes */
+	if (PageTransHuge(page) && total_mapcount(page) > 1)
+		return 0;
+
 	/* Avoid migrating to a node that is nearly full */
 	if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
 		return 0;
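The check uses total_mapcount() rather than page_mapcount() because, for a compound page, the latter only reflects the head page and would miss PTE mappings of the tail pages. A small illustration (the helper is hypothetical, for explanation only):

/*
 * Hypothetical helper: true if a (possibly huge) page is mapped by more
 * than one mapping. total_mapcount() accounts for both the PMD mapping of
 * the THP and any PTE mappings of its sub-pages, so it catches sharing
 * that page_mapcount() on the head page alone would not.
 */
static bool page_shared_across_mappings(struct page *page)
{
	return total_mapcount(page) > 1;
}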
From: Ze Zuo <zuoze1@huawei.com>
In the hardware-sampling NUMA balancing scenario, add support for PMD-level (THP) page migration, similar to the do_huge_pmd_numa_page() path in the auto NUMA scheme.
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
---
 mm/mem_sampling.c | 66 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 2 deletions(-)
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
index 0eaea2680d83..23f3eb28c5e6 100644
--- a/mm/mem_sampling.c
+++ b/mm/mem_sampling.c
@@ -144,6 +144,41 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 	return mpol_misplaced(page, vma, addr);
 }
+/* NUMA hinting page access point for trans huge pmds */
+void do_huge_pmd_numa_access(struct vm_area_struct *vma, u64 vaddr, struct page *page)
+{
+	int page_nid = NUMA_NO_NODE;
+	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
+	bool migrated = false;
+	int flags = 0;
+	u64 haddr = vaddr & HPAGE_PMD_MASK;
+
+	page = compound_head(page);
+	page_nid = page_to_nid(page);
+	last_cpupid = page_cpupid_last(page);
+	target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
+				       &flags);
+
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(page);
+		goto out;
+	}
+
+	migrated = migrate_misplaced_page(page, vma, target_nid);
+	if (migrated) {
+		flags |= TNF_MIGRATED;
+		page_nid = target_nid;
+	} else {
+		flags |= TNF_MIGRATE_FAIL;
+	}
+
+out:
+	trace_mm_numa_migrating(haddr, page_nid, target_nid, flags & TNF_MIGRATED);
+	if (page_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
+				flags);
+}
+
 /*
  * Called from task_work context to act upon the page access.
  *
@@ -161,6 +196,11 @@ static void do_numa_access(struct task_struct *p, u64 vaddr, u64 paddr)
 	int last_cpupid;
 	int target_nid;
 	int flags = 0;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd = NULL;
+	pmd_t pmde;
 	if (!mm)
 		return;
@@ -190,9 +230,31 @@ static void do_numa_access(struct task_struct *p, u64 vaddr, u64 paddr)
 	if (unlikely(!PageLRU(page)))
 		goto out_unlock;
-	/* TODO: handle PTE-mapped THP or PMD-mapped THP*/
-	if (PageCompound(page))
+	if (PageCompound(page)) {
+		pgd = pgd_offset(mm, vaddr);
+		if (!pgd_present(*pgd))
+			goto out_unlock;
+
+		p4d = p4d_offset(pgd, vaddr);
+		if (!p4d_present(*p4d))
+			goto out_unlock;
+
+		pud = pud_offset(p4d, vaddr);
+		if (!pud_present(*pud))
+			goto out_unlock;
+
+		pmd = pmd_offset(pud, vaddr);
+		if (!pmd)
+			goto out_unlock;
+		pmde = *pmd;
+
+		barrier();
+		if (pmd_trans_huge(pmde) || pmd_devmap(pmde))
+			/* handle PMD-mapped THP */
+			do_huge_pmd_numa_access(vma, vaddr, page);
+		/* TODO: handle PTE-mapped THP */
 		goto out_unlock;
+	}
 	/*
 	 * Flag if the page is shared between multiple address spaces. This
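The pgd -> p4d -> pud -> pmd descent added above is the standard way to reach the PMD covering the sampled address; as a sketch, the same walk could be factored into a small helper (hypothetical, not part of the patch; the in-tree mm_find_pmd() does something very similar):

/*
 * Hypothetical helper: walk to the PMD covering 'addr', or return NULL if
 * any intermediate level is not present. The caller still snapshots the
 * PMD value (pmde = *pmd, plus a barrier()) before testing
 * pmd_trans_huge()/pmd_devmap() on it.
 */
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d;
	pud_t *pud;

	if (!pgd_present(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return NULL;
	return pmd_offset(pud, addr);
}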
From: Ze Zuo <zuoze1@huawei.com>
Skip shared copy-on-write pages and KSM pages for base pages.
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
---
 mm/mem_sampling.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
index 23f3eb28c5e6..bbf6e6ed9229 100644
--- a/mm/mem_sampling.c
+++ b/mm/mem_sampling.c
@@ -24,6 +24,8 @@
 #include <linux/sched/numa_balancing.h>
 #include <trace/events/kmem.h>
+#include "internal.h"
+
 struct mem_sampling_ops_struct mem_sampling_ops;
 static int mem_sampling;
@@ -224,7 +226,7 @@ static void do_numa_access(struct task_struct *p, u64 vaddr, u64 paddr)
 		goto out_unlock;
 	page = pfn_to_online_page(PHYS_PFN(paddr));
-	if (!page || is_zone_device_page(page))
+	if (!page || is_zone_device_page(page) || PageKsm(page))
 		goto out_unlock;
 	if (unlikely(!PageLRU(page)))
@@ -263,6 +265,11 @@ static void do_numa_access(struct task_struct *p, u64 vaddr, u64 paddr)
 	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
 		flags |= TNF_SHARED;
+	/* Also skip shared copy-on-write pages */
+	if (is_cow_mapping(vma->vm_flags) &&
+	    page_count(page) != 1)
+		goto out_unlock;
+
 	last_cpupid = page_cpupid_last(page);
 	page_nid = page_to_nid(page);
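The #include "internal.h" added earlier in this patch is what makes is_cow_mapping() visible here; in kernels of this vintage it lives in mm/internal.h and reads roughly as below (paraphrased), which is why the check above pairs it with page_count(page) != 1 to skip frames still shared through a private, writable-in-principle mapping:

/*
 * Paraphrase of is_cow_mapping() from mm/internal.h: a mapping that is not
 * VM_SHARED but may be written (VM_MAYWRITE) is a copy-on-write mapping.
 */
static inline bool is_cow_mapping(vm_flags_t flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}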