From: Fang Lijun <fanglijun3@huawei.com>
ascend inclusion
category: Bugfix
bugzilla: NA
CVE: NA
--------------
The system cannot use the memory of the CDM nodes directly, but it can mmap huge pages from all nodes, so a Bus error is raised when the mmap succeeds but the node does not have enough huge pages.

When cdmmask is set, users pass the numa node id through the mmap flags to map huge pages from that specific node; if there are not enough huge pages on that node, return -ENOMEM.
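As a rough illustration only (not part of this patch), a userspace caller on a kernel with this change and a non-empty cdmmask could ask for huge pages from one node along the lines of the sketch below. The mount point /mnt/huge, the file name and the node id 2 are made-up examples; the node id is placed in the MAP_HUGE_* bit field of the mmap flags because that is where the patched ksys_mmap_pgoff() reads it from.

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <unistd.h>

  #ifndef MAP_HUGE_SHIFT
  #define MAP_HUGE_SHIFT 26   /* value from asm-generic/mman-common.h */
  #endif

  int main(void)
  {
          int nid = 2;                    /* example CDM node id */
          size_t len = 4UL << 21;         /* four huge pages, assuming 2MB size */
          void *p;

          /* /mnt/huge is assumed to be a mounted hugetlbfs instance */
          int fd = open("/mnt/huge/testfile", O_CREAT | O_RDWR, 0644);
          if (fd < 0) {
                  perror("open");
                  return 1;
          }

          /*
           * With cdmmask set, the patched ksys_mmap_pgoff() takes the node
           * id from the MAP_HUGE_* bit field, and hugetlbfs_file_mmap()
           * fails with -ENOMEM if that node lacks enough free huge pages.
           */
          p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_SHARED | (nid << MAP_HUGE_SHIFT), fd, 0);
          if (p == MAP_FAILED)
                  perror("mmap");
          else
                  munmap(p, len);

          close(fd);
          return 0;
  }

With this patch, if node 2 does not have enough free huge pages the mmap() above fails with ENOMEM instead of succeeding and hitting a Bus error at fault time.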
v2: Fix a compile error when CONFIG_COHERENT_DEVICE is disabled.
Signed-off-by: Fang Lijun <fanglijun3@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/hugetlbfs/inode.c    | 45 +++++++++++++++++++++++++++++++++++++++++
 include/linux/hugetlb.h |  4 ++++
 include/linux/mm.h      |  9 +++++++++
 mm/hugetlb.c            |  2 ++
 mm/mmap.c               | 13 ++++++++++++
 5 files changed, 73 insertions(+)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ef2c25b71736..2ac900f02280 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -120,6 +120,45 @@ static void huge_pagevec_release(struct pagevec *pvec)
         pagevec_reinit(pvec);
 }
 
+/*
+ * Check that the current numa node has enough free huge pages for this
+ * hugetlb mmap. resv_huge_pages_node: huge pages reserved via mmap but
+ * not yet used on the given numa node.
+ */
+static int hugetlb_checknode(struct vm_area_struct *vma, long nr)
+{
+        int nid;
+        int ret = 0;
+        struct hstate *h = &default_hstate;
+
+        spin_lock(&hugetlb_lock);
+
+        nid = vma->vm_flags >> CHECKNODE_BITS;
+
+        if (nid >= MAX_NUMNODES) {
+                ret = -EINVAL;
+                goto err;
+        }
+
+        if (h->free_huge_pages_node[nid] < nr) {
+                ret = -ENOMEM;
+                goto err;
+        } else {
+                if (h->resv_huge_pages_node[nid] + nr >
+                        h->free_huge_pages_node[nid]) {
+                        ret = -ENOMEM;
+                        goto err;
+                } else {
+                        h->resv_huge_pages_node[nid] += nr;
+                        ret = 0;
+                }
+        }
+
+err:
+        spin_unlock(&hugetlb_lock);
+        return ret;
+}
+
 /*
  * Mask used when checking the page offset value passed in via system
  * calls. This value will be converted to a loff_t which is signed.
@@ -172,6 +211,12 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         inode_lock(inode);
         file_accessed(file);
 
+        if (is_set_cdmmask()) {
+                ret = hugetlb_checknode(vma, len >> huge_page_shift(h));
+                if (ret < 0)
+                        goto out;
+        }
+
         ret = -ENOMEM;
         if (hugetlb_reserve_pages(inode,
                         vma->vm_pgoff >> huge_page_order(h),
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f9ac17a4d368..32f2837a6075 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -15,6 +15,9 @@ struct ctl_table;
 struct user_struct;
 struct mmu_gather;
 
+#define CHECKNODE_BITS 48
+#define CHECKNODE_MASK (~((_AC(1, UL) << CHECKNODE_BITS) - 1))
+
 #ifndef is_hugepd
 /*
  * Some architectures requires a hugepage directory format that is
@@ -350,6 +353,7 @@ struct hstate {
         unsigned int nr_huge_pages_node[MAX_NUMNODES];
         unsigned int free_huge_pages_node[MAX_NUMNODES];
         unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+        unsigned int resv_huge_pages_node[MAX_NUMNODES];
 #ifdef CONFIG_CGROUP_HUGETLB
         /* cgroup control files */
         struct cftype cgroup_files[5];
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b985af8ea7df..794d21255bfc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -70,6 +70,15 @@ extern const int mmap_rnd_compat_bits_max;
 extern int mmap_rnd_compat_bits __read_mostly;
 #endif
 
+#ifdef CONFIG_COHERENT_DEVICE
+static inline bool is_set_cdmmask(void)
+{
+        return !nodes_empty(cdmmask);
+}
+#else
+#define is_set_cdmmask() (0)
+#endif
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2938b5bb7a49..0eb0c943397f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -969,6 +969,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
         if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
                 SetPagePrivate(page);
                 h->resv_huge_pages--;
+                if (is_set_cdmmask())
+                        h->resv_huge_pages_node[vma->vm_flags >> CHECKNODE_BITS]--;
         }
 
         mpol_cond_put(mpol);
diff --git a/mm/mmap.c b/mm/mmap.c
index c1034012aeaa..e1a4d3fa713e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -69,6 +69,7 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
 int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
 #endif
 
+static unsigned long numanode;
 static bool ignore_rlimit_data;
 core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
 
@@ -1531,6 +1532,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
                         vm_flags |= VM_NORESERVE;
         }
 
+        /* Store the numa node id in vm_flags; the hugetlbfs file mmap
+         * will use it to check the node.
+         */
+        if (is_set_cdmmask())
+                vm_flags |= ((numanode << CHECKNODE_BITS) & CHECKNODE_MASK);
+
         addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
         if (!IS_ERR_VALUE(addr) &&
             ((vm_flags & VM_LOCKED) ||
@@ -1546,6 +1553,12 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
         struct file *file = NULL;
         unsigned long retval;
 
+        /* Get the mmap numa node id. */
+        if (is_set_cdmmask()) {
+                numanode = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
+                flags &= ~(MAP_HUGE_MASK << MAP_HUGE_SHIFT);
+        }
+
         if (!(flags & MAP_ANONYMOUS)) {
                 audit_mmap_fd(fd, flags);
                 file = fget(fd);