From: Wang Wensheng <wangwensheng4@huawei.com> hulk inclusion category: feature category: bugfix bugzilla: NA ---------------------------------------- Don't use hugetlb_no_page() to allocate sharepool hugepages, because we want to demote 1G-sized hugetlb pages when there are no 2M-sized hugetlb pages. We allocate hugepages directly via alloc_hugetlb_folio_nodemask_size() and map those hugepages via hugetlb_insert_hugepage_pte(), just the same as what we do in k2u. Fixes: 00c7c3d64806 ("hugetlb: support auto demote and promote") Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com> --- mm/share_pool.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 2 deletions(-) diff --git a/mm/share_pool.c b/mm/share_pool.c index 100446279620..9ff7a2da3dfc 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -700,7 +700,10 @@ struct sp_area { struct sp_group *spg; struct sp_mapping *spm; /* where spa born from */ enum spa_type type; - unsigned long kva; /* shared kva */ + union { + unsigned long kva; /* shared kva */ + struct page **pages; /* for hugetlb alloc */ + }; pid_t applier; /* the original applier process */ int preferred_node_id; /* memory node */ struct work_struct work; @@ -1497,6 +1500,65 @@ static bool sp_group_delete_area(struct sp_group *spg, struct sp_area *spa) return atomic_dec_and_test(&spa->spg->spa_num); } +static bool sp_area_alloc_hugepage_enable __read_mostly = true; + +static int __init sp_area_alloc_hugepage_disable(char *p) +{ + sp_area_alloc_hugepage_enable = false; + + return 1; +} +__setup("sp_area_alloc_hugepage_disable", sp_area_alloc_hugepage_disable); + +static bool sp_area_need_hugepage(struct sp_area *spa) +{ + return sp_area_alloc_hugepage_enable && spa->type == SPA_TYPE_ALLOC && spa->is_hugepage; +} + +static bool sp_area_alloc_hugepages(struct sp_area *spa, int nid, nodemask_t *nodemask) +{ + int i; + struct page **pages; + int nr_pages = ALIGN(spa_size(spa), PMD_SIZE) / PMD_SIZE; + + pages = 
kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return false; + + for (i = 0; i < nr_pages; i++) { + pages[i] = (struct page *)alloc_hugetlb_folio_nodemask_size(PMD_SIZE, + nid, nodemask); + if (!pages[i]) { + while (i--) + put_page(pages[i]); + kvfree(pages); + return false; + } + memset(page_to_virt(pages[i]), 0, PMD_SIZE); + } + + spa->pages = pages; + + return true; +} + +static void sp_area_free_hugepages(struct sp_area *spa) +{ + int nr_pages = ALIGN(spa->real_size, PMD_SIZE) / PMD_SIZE; + + if (!sp_area_need_hugepage(spa)) + return; + + if (!spa->pages) + return; + + while (nr_pages--) + put_page(spa->pages[nr_pages]); + + kvfree(spa->pages); + spa->pages = NULL; +} + /** * sp_area_alloc() - Allocate a region of VA from the share pool. * @size: the size of VA to allocate. @@ -1721,6 +1783,7 @@ static void sp_area_free(struct sp_area *spa) rb_erase(&spa->rb_node, &spm->area_root); spin_unlock(&spm->sp_mapping_lock); RB_CLEAR_NODE(&spa->rb_node); + sp_area_free_hugepages(spa); kfree(spa); } @@ -1924,11 +1987,32 @@ int mg_sp_free(unsigned long addr, int id) } EXPORT_SYMBOL_GPL(mg_sp_free); +static int sp_vma_insert_hugepages(struct vm_area_struct *vma, struct page **pages, + unsigned long uaddr, unsigned long size) +{ + int i = 0; + + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + do { + int ret = hugetlb_insert_hugepage_pte(vma->vm_mm, uaddr, + vma->vm_page_prot, pages[i]); + if (ret) + return ret; + + uaddr += PMD_SIZE; + size -= PMD_SIZE; + i++; + } while (size > 0); + + return 0; +} + /* wrapper of __do_mmap() and the caller must hold mmap_write_lock(mm). 
*/ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, struct sp_area *spa, unsigned long *populate, unsigned long prot) { + int ret = 0; unsigned long addr = spa->va_start; unsigned long size = spa_size(spa); unsigned long flags = MAP_FIXED_NOREPLACE | MAP_SHARED | MAP_POPULATE | @@ -1959,6 +2043,14 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, else vm_flags_clear(vma, VM_MAYWRITE); + if (sp_area_need_hugepage(spa)) { + ret = sp_vma_insert_hugepages(vma, spa->pages, addr, size); + if (ret) { + do_munmap(mm, addr, size, NULL); + return (unsigned long)ret; + } + } + return addr; } @@ -2156,7 +2248,7 @@ static int sp_map_spa_to_mm(struct mm_struct *mm, struct sp_area *spa, unsigned long prot, struct sp_alloc_context *ac, const char *str) { - int ret; + int ret = 0; unsigned long mmap_addr; unsigned long populate = 0; @@ -2179,6 +2271,13 @@ static int sp_map_spa_to_mm(struct mm_struct *mm, struct sp_area *spa, switch (spa->type) { case SPA_TYPE_ALLOC: mmap_write_unlock(mm); + /* + * If an spa of SPA_TYPE_ALLOC has non-NULL pages, we must have + * populated it in sp_mmap() before. So just break and don't + * populate it again. + */ + if (spa->pages) + break; ret = sp_alloc_populate(mm, spa, populate, ac); if (ret) { mmap_write_lock(mm); @@ -2234,6 +2333,10 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, struct sp_alloc_context * struct mm_struct *mm; bool reach_current = false; + if (sp_area_need_hugepage(spa) && + !sp_area_alloc_hugepages(spa, ac->preferred_node_id, ac->nodemask)) + return -ENOMEM; + mmap_ret = sp_map_spa_to_mm(current->mm, spa, spg_node->prot, ac, "sp_alloc"); if (mmap_ret) { /* Don't skip error for current process */ -- 2.43.0