From: Rik van Riel <riel@surriel.com>
next inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/IAO6NS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?...
--------------------------------
Take the end of a file write into consideration when deciding whether or not to use huge pages for tmpfs files when the tmpfs filesystem is mounted with huge=within_size.
This allows large writes that append to the end of a file to automatically use large pages.
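As a rough sketch of the decision this changes (a simplified, standalone rendition of the logic in __shmem_huge_global_enabled(), not the kernel code itself; the PAGE_SHIFT/HPAGE_PMD_NR values are the usual x86-64 ones and the helper names are made up for illustration):

```c
#include <stdbool.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define HPAGE_PMD_NR	512	/* pages per 2MB PMD huge page on x86-64 */

static uint64_t round_up_to(uint64_t x, uint64_t align)
{
	return (x + align - 1) & ~(align - 1);
}

/*
 * Simplified model of the huge=within_size check: a PMD-sized page is
 * allowed at 'index' if either the current i_size or the end of the
 * write in progress covers the whole huge page containing that index.
 */
static bool within_size_allows_huge(uint64_t index, uint64_t i_size,
				    uint64_t write_end)
{
	uint64_t size = i_size;

	/* first page index past the huge page that contains 'index' */
	index = round_up_to(index + 1, HPAGE_PMD_NR);

	if (write_end > size)	/* new: also consider the write end */
		size = write_end;
	size = round_up_to(size, PAGE_SIZE);

	return (size >> PAGE_SHIFT) >= index;
}
```

Previously only i_size_read() was consulted here, so an appending write saw the old, smaller i_size and the within_size check kept failing until the file had already grown past the huge page boundary.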
With fio doing 4MB sequential writes without fallocate to a 16GB tmpfs file, the numbers without THP or with huge=always stay the same, but the performance with huge=within_size now matches that of huge=always.
huge		before		after
4kB pages	1560 MB/s	1560 MB/s
within_size	1560 MB/s	4720 MB/s
always:		4720 MB/s	4720 MB/s
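The measurement above was done with fio; the exact job parameters are not part of this changelog. As a hypothetical, minimal equivalent of the described access pattern (sequential 4MB appends, no fallocate), the following userspace loop exercises the same path — the mount point and total size here are assumptions, not the original test setup:

```c
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CHUNK	(4UL << 20)	/* 4MB per write, as described above */

int main(void)
{
	/* Assumes a tmpfs mounted with "-o huge=within_size" at this path. */
	int fd = open("/mnt/tmpfs/testfile", O_CREAT | O_TRUNC | O_WRONLY, 0600);
	char *buf;
	long i;

	if (fd < 0)
		return 1;
	buf = malloc(CHUNK);
	if (!buf)
		return 1;
	memset(buf, 0xab, CHUNK);

	/* 4096 * 4MB = 16GB of sequential appends, growing the file as we go. */
	for (i = 0; i < 4096; i++)
		if (write(fd, buf, CHUNK) != (ssize_t)CHUNK)
			return 1;

	free(buf);
	close(fd);
	return 0;
}
```

With write_end (pos + len) now passed down from shmem_write_begin(), each of these appends already qualifies for a PMD-sized page under huge=within_size.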
[akpm@linux-foundation.org: coding-style cleanups]
Link: https://lkml.kernel.org/r/20240903111928.7171e60c@imladris.surriel.com
Signed-off-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Conflicts:
	fs/xfs/scrub/xfile.c
	fs/xfs/xfs_buf_mem.c
	mm/khugepaged.c
	mm/shmem.c
[ Conflict in xfile.c, xfs_buf_mem.c and shmem.c because shmem_get_folio()
  has not been exported and used in xfs. Conflict in khugepaged.c because
  some pages there have not been converted to folios. Context conflict with
  mm_in_dynamic_pool() in shmem.c ]
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 include/linux/shmem_fs.h |  8 +++---
 mm/huge_memory.c         |  2 +-
 mm/khugepaged.c          |  2 +-
 mm/shmem.c               | 58 +++++++++++++++++++++-------------------
 mm/userfaultfd.c         |  2 +-
 5 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 57e8a6689439..0880504a781e 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -117,11 +117,11 @@ int shmem_unuse(unsigned int type);
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
-				bool shmem_huge_force);
+				loff_t write_end, bool shmem_huge_force);
 #else
 static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
-				bool shmem_huge_force)
+				loff_t write_end, bool shmem_huge_force)
 {
 	return 0;
 }
@@ -147,8 +147,8 @@ enum sgp_type {
 	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
 };

-int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-		enum sgp_type sgp);
+int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
+		struct folio **foliop, enum sgp_type sgp);
 struct folio *shmem_read_folio_gfp(struct address_space *mapping,
 		pgoff_t index, gfp_t gfp);

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fbcfbd5fa914..ea560ea7b39e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -158,7 +158,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 	 */
 	if (!in_pf && shmem_file(vma->vm_file))
 		return shmem_allowable_huge_orders(file_inode(vma->vm_file),
-						   vma, vma->vm_pgoff,
+						   vma, vma->vm_pgoff, 0,
 						   !enforce_sysfs);

 	if (!vma_is_anonymous(vma)) {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 8006b13304de..c6379a2d55bc 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1900,7 +1900,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 		if (xa_is_value(page) || !PageUptodate(page)) {
 			xas_unlock_irq(&xas);
 			/* swap in or instantiate fallocated page */
-			if (shmem_get_folio(mapping->host, index,
+			if (shmem_get_folio(mapping->host, index, 0,
 					    &folio, SGP_NOALLOC)) {
 				result = SCAN_FAIL;
 				goto xa_unlocked;
diff --git a/mm/shmem.c b/mm/shmem.c
index 9ab915d5c060..700335e56e67 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -543,7 +543,8 @@ static bool shmem_confirm_swap(struct address_space *mapping,
 static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;

 static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
-					bool shmem_huge_force, struct vm_area_struct *vma,
+					loff_t write_end, bool shmem_huge_force,
+					struct vm_area_struct *vma,
 					unsigned long vm_flags)
 {
 	struct mm_struct *mm = vma ? vma->vm_mm : NULL;
@@ -563,7 +564,8 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
 		return true;
 	case SHMEM_HUGE_WITHIN_SIZE:
 		index = round_up(index + 1, HPAGE_PMD_NR);
-		i_size = round_up(i_size_read(inode), PAGE_SIZE);
+		i_size = max(write_end, i_size_read(inode));
+		i_size = round_up(i_size, PAGE_SIZE);
 		if (i_size >> PAGE_SHIFT >= index)
 			return true;
 		fallthrough;
@@ -577,14 +579,14 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
 }

 static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
-		bool shmem_huge_force, struct vm_area_struct *vma,
-		unsigned long vm_flags)
+		loff_t write_end, bool shmem_huge_force,
+		struct vm_area_struct *vma, unsigned long vm_flags)
 {
 	if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
 		return false;

-	return __shmem_huge_global_enabled(inode, index, shmem_huge_force,
-					   vma, vm_flags);
+	return __shmem_huge_global_enabled(inode, index, write_end,
+					   shmem_huge_force, vma, vm_flags);
 }

 #if defined(CONFIG_SYSFS)
@@ -769,8 +771,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 }

 static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
-		bool shmem_huge_force, struct vm_area_struct *vma,
-		unsigned long vm_flags)
+		loff_t write_end, bool shmem_huge_force,
+		struct vm_area_struct *vma, unsigned long vm_flags)
 {
 	return false;
 }
@@ -976,7 +978,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
 	 * (although in some cases this is just a waste of time).
 	 */
 	folio = NULL;
-	shmem_get_folio(inode, index, &folio, SGP_READ);
+	shmem_get_folio(inode, index, 0, &folio, SGP_READ);
 	return folio;
 }

@@ -1161,7 +1163,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
 			STATX_ATTR_NODUMP);
 	generic_fillattr(idmap, request_mask, inode, stat);

-	if (shmem_huge_global_enabled(inode, 0, false, NULL, 0))
+	if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
 		stat->blksize = HPAGE_PMD_SIZE;

 	if (request_mask & STATX_BTIME) {
@@ -1650,7 +1652,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
-				bool shmem_huge_force)
+				loff_t write_end, bool shmem_huge_force)
 {
 	unsigned long mask = READ_ONCE(huge_shmem_orders_always);
 	unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
@@ -1667,8 +1669,8 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
 		return 0;

-	global_huge = shmem_huge_global_enabled(inode, index, shmem_huge_force,
-						vma, vm_flags);
+	global_huge = shmem_huge_global_enabled(inode, index, write_end,
+						shmem_huge_force, vma, vm_flags);
 	if (!vma || !vma_is_anon_shmem(vma)) {
 		/*
 		 * For tmpfs, we now only support PMD sized THP if huge page
@@ -2112,8 +2114,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
  * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
  */
 static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
-		struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
-		struct vm_fault *vmf, vm_fault_t *fault_type)
+		loff_t write_end, struct folio **foliop, enum sgp_type sgp,
+		gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
 {
 	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
 	struct mm_struct *fault_mm;
@@ -2193,7 +2195,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 	}

 	/* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
-	orders = shmem_allowable_huge_orders(inode, vma, index, false);
+	orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
 	if (mm_in_dynamic_pool(vma ? vma->vm_mm : current->mm))
 		orders = 0;
 	if (orders > 0) {
@@ -2294,10 +2296,10 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 	return error;
 }

-int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-		enum sgp_type sgp)
+int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
+		struct folio **foliop, enum sgp_type sgp)
 {
-	return shmem_get_folio_gfp(inode, index, foliop, sgp,
+	return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
 			mapping_gfp_mask(inode->i_mapping), NULL, NULL);
 }

@@ -2391,7 +2393,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 	}

 	WARN_ON_ONCE(vmf->page != NULL);
-	err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
+	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
 				  gfp, vmf, &ret);
 	if (err)
 		return vmf_error(err);
@@ -2876,7 +2878,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 			return -EPERM;
 	}

-	ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
+	ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
 	if (ret)
 		return ret;

@@ -2947,7 +2949,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			break;
 		}

-		error = shmem_get_folio(inode, index, &folio, SGP_READ);
+		error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
 		if (error) {
 			if (error == -EINVAL)
 				error = 0;
@@ -3123,7 +3125,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 		if (*ppos >= i_size_read(inode))
 			break;

-		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
+		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
 					SGP_READ);
 		if (error) {
 			if (error == -EINVAL)
@@ -3310,8 +3312,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
 			error = -ENOMEM;
 		else
-			error = shmem_get_folio(inode, index, &folio,
-						SGP_FALLOC);
+			error = shmem_get_folio(inode, index, offset + len,
+						&folio, SGP_FALLOC);
 		if (error) {
 			info->fallocend = undo_fallocend;
 			/* Remove the !uptodate folios we added */
@@ -3663,7 +3665,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	} else {
 		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &shmem_aops;
-		error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
+		error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
 		if (error)
 			goto out_remove_offset;
 		inode->i_op = &shmem_symlink_inode_operations;
@@ -3709,7 +3711,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
 			return ERR_PTR(-ECHILD);
 		}
 	} else {
-		error = shmem_get_folio(inode, 0, &folio, SGP_READ);
+		error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
 		if (error)
 			return ERR_PTR(error);
 		if (!folio)
@@ -5168,7 +5170,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
 	struct folio *folio;
 	int error;

-	error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
+	error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
 				  gfp, NULL, NULL);
 	if (error)
 		return ERR_PTR(error);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 4ab24c56f660..8c22dd4e5e15 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -292,7 +292,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 	struct page *page;
 	int ret;

-	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
+	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
 	/* Our caller expects us to return -EFAULT if we failed to find folio */
 	if (ret == -ENOENT)
 		ret = -EFAULT;