kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IDG4AX ------------------------------------------------- This patch contains preliminary support for user space NUMA replication. This is the first iteration, in the next revisions this patch will be split into multiple pieces. Acked-by: Ilya Hanov <ilya.hanov@huawei-partners.com> Acked-by: Denis Darvish <darvish.denis@huawei.com> Acked-by: Artem Kuzin <artem.kuzin@huawei.com> Co-developed-by: Gadeev Dmitry <gadeev.dmitry@h-partners.com> Signed-off-by: Gadeev Dmitry <gadeev.dmitry@h-partners.com> Co-developed-by: Nikita Panov <panov.nikita@huawei.com> Signed-off-by: Nikita Panov <panov.nikita@huawei.com> --- arch/arm64/include/asm/numa_replication.h | 3 + arch/arm64/mm/init.c | 2 +- arch/arm64/mm/pgd.c | 13 +- fs/exec.c | 18 + fs/proc/base.c | 76 + fs/proc/task_mmu.c | 111 +- include/asm-generic/pgalloc.h | 19 +- include/asm-generic/tlb.h | 22 + include/linux/cgroup.h | 1 + include/linux/gfp_types.h | 12 +- include/linux/memcontrol.h | 4 + include/linux/mm.h | 75 +- include/linux/mm_inline.h | 2 +- include/linux/mm_types.h | 49 +- include/linux/numa_kernel_replication.h | 185 ++- include/linux/numa_user_replication.h | 738 ++++++++++ include/linux/page-flags.h | 18 +- include/trace/events/mmflags.h | 10 +- include/uapi/asm-generic/mman-common.h | 2 + kernel/cgroup/cgroup.c | 2 +- kernel/events/uprobes.c | 5 +- kernel/fork.c | 41 + kernel/sched/fair.c | 8 +- mm/Kconfig | 13 + mm/Makefile | 1 + mm/gup.c | 3 +- mm/ksm.c | 15 +- mm/madvise.c | 18 +- mm/memcontrol.c | 137 +- mm/memory.c | 544 +++++-- mm/mempolicy.c | 5 + mm/migrate.c | 11 +- mm/migrate_device.c | 17 +- mm/mlock.c | 31 + mm/mmap.c | 32 + mm/mmu_gather.c | 55 +- mm/mprotect.c | 409 +++--- mm/mremap.c | 97 +- mm/numa_kernel_replication.c | 4 +- mm/numa_user_replication.c | 1577 +++++++++++++++++++++ mm/page_alloc.c | 8 +- mm/page_idle.c | 3 +- mm/page_vma_mapped.c | 3 +- mm/rmap.c | 41 +- mm/swap.c | 7 +- mm/swapfile.c | 3 +- mm/userfaultfd.c | 7 +- mm/userswap.c | 11 +- 48 files changed, 4049 insertions(+), 419 deletions(-) create mode 100644 include/linux/numa_user_replication.h create mode 100644 mm/numa_user_replication.c diff --git a/arch/arm64/include/asm/numa_replication.h b/arch/arm64/include/asm/numa_replication.h index 7b515c7d41981..d8647a89c5cc2 100644 --- a/arch/arm64/include/asm/numa_replication.h +++ b/arch/arm64/include/asm/numa_replication.h @@ -22,6 +22,9 @@ static inline pgd_t *numa_replicate_pgt_pgd(int nid) pgd_page = alloc_pages_node(nid, GFP_PGTABLE_KERNEL, 2); BUG_ON(pgd_page == NULL); + SetPageReplicated(pgd_page); + SetPageReplicated(pgd_page + 2); + new_pgd = (pgd_t *)page_address(pgd_page); new_pgd += (PAGE_SIZE * 2 / sizeof(pgd_t)); //Extra pages for KPTI copy_page(new_pgd, swapper_pg_dir); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 8d955787e030f..ef05ba6d6007d 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -606,7 +606,7 @@ void __init preallocate_vmalloc_pages(void) { unsigned long addr; - for (addr = MODULES_VADDR; addr <= VMALLOC_END; + for (addr = MODULES_VADDR; addr <= VMALLOC_END && addr != 0UL; addr = ALIGN(addr + 1, PGDIR_SIZE)) { pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 8326bd693b296..863b427e54ed8 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -41,20 +41,22 @@ pgd_t *page_pgd_alloc(struct mm_struct *mm) { int nid; gfp_t gfp = GFP_PGTABLE_USER | __GFP_THISNODE; + /* * Kernel replication is 
not supproted in case of non-page size pgd, * in general we can support it, but maybe later, due to we need to * update page tables allocation significantly, so, let's panic here. */ + for_each_memory_node(nid) { struct page *page; page = alloc_pages_node(nid, gfp, 0); if (!page) goto fail; - WARN_ON_ONCE(page_to_nid(page) != nid); + SetPageReplicated(page); *per_node_pgd_ptr(mm, nid) = (pgd_t *)page_address(page); } @@ -62,6 +64,7 @@ pgd_t *page_pgd_alloc(struct mm_struct *mm) *per_node_pgd_ptr(mm, nid) = per_node_pgd(mm, numa_get_memory_node(nid)); mm->pgd = per_node_pgd(mm, numa_get_memory_node(0));/*!!!*/ + build_pgd_chain(mm->pgd_numa); return mm->pgd; @@ -86,15 +89,23 @@ static pgd_t *pgd_alloc_replica(struct mm_struct *mm) static void page_pgd_free(struct mm_struct *mm, pgd_t *pgd) { int nid; + /* * Kernel replication is not supproted in case of non-page size pgd, * in general we can support it, but maybe later, due to we need to * update page tables allocation significantly, so, let's panic here. */ + + if (per_node_pgd(mm, first_memory_node) == NULL) + return; + + clear_pgtable_list(virt_to_ptdesc(per_node_pgd(mm, first_memory_node))); for_each_memory_node(nid) { if (per_node_pgd(mm, nid) == NULL) break; WARN_ON_ONCE(page_to_nid(virt_to_page(per_node_pgd(mm, nid))) != nid); + + ClearPageReplicated(virt_to_page(per_node_pgd(mm, nid))); free_page((unsigned long)per_node_pgd(mm, nid)); } diff --git a/fs/exec.c b/fs/exec.c index eaec57f79aa19..7eaa0f6322736 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -66,6 +66,7 @@ #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> +#include <linux/numa_user_replication.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -276,6 +277,14 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); +#ifdef CONFIG_USER_REPLICATION + if (memcg_replication_enabled(mm)) { + __vm_flags_mod(vma, VM_REPLICA_INIT, VM_NONE); + if (vma_might_be_replicated(vma)) + __vm_flags_mod(vma, VM_REPLICA_COMMIT, VM_NONE); + } +#endif + err = insert_vm_struct(mm, vma); if (err) goto err; @@ -814,6 +823,15 @@ int setup_arg_pages(struct linux_binprm *bprm, vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; +#ifdef CONFIG_USER_REPLICATION + if (memcg_replication_enabled(mm)) { + vm_flags |= VM_REPLICA_INIT; + if (vmflags_might_be_replicated(vm_flags)) + vm_flags |= VM_REPLICA_COMMIT; + else + vm_flags &= ~VM_REPLICA_COMMIT; + } +#endif vma_iter_init(&vmi, mm, vma->vm_start); diff --git a/fs/proc/base.c b/fs/proc/base.c index 276588a25225d..828373ba91042 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -81,6 +81,7 @@ #include <linux/audit.h> #include <linux/poll.h> #include <linux/nsproxy.h> +#include <linux/numa_user_replication.h> #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> @@ -3503,6 +3504,78 @@ static const struct file_operations proc_pid_xcall_operations = { }; #endif +#ifdef CONFIG_USER_REPLICATION + +static ssize_t numa_replication_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct mm_struct *mm; + char buffer[PROC_NUMBUF]; + size_t len; + int ret; + + if (!task) + return -ESRCH; + + ret = 0; + mm = get_task_mm(task); + if (mm) { + len = snprintf(buffer, sizeof(buffer), "%d\n", memcg_replication_enabled(mm)); + mmput(mm); + ret = 
simple_read_from_buffer(buf, count, ppos, buffer, len); + } + + put_task_struct(task); + + return ret; +} + +static ssize_t numa_replication_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + struct mm_struct *mm; + int val, ret; + + ret = kstrtoint_from_user(buf, count, 0, &val); + if (ret < 0) + return ret; + + if (!is_text_replicated()) + return -EINVAL; + + ret = -ESRCH; + task = get_proc_task(file_inode(file)); + if (!task) + goto out_no_task; + + mm = get_task_mm(task); + if (!mm) + goto out_no_mm; + ret = 0; + + if ((val != 0 && val != 1) || (val == 0 && memcg_replication_enabled(mm))) + ret = -EINVAL; + else if (val == 1) + numa_mm_handle_replication(mm, true, FORK_DISCARD_REPLICA); + + mmput(mm); +out_no_mm: + put_task_struct(task); +out_no_task: + if (ret < 0) + return ret; + return count; +} + +static const struct file_operations proc_numa_replication_operations = { + .read = numa_replication_read, + .write = numa_replication_write, +}; + +#endif + /* * Thread groups */ @@ -3548,6 +3621,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), +#endif +#ifdef CONFIG_USER_REPLICATION + REG("numa_replication", S_IRUSR|S_IWUSR, proc_numa_replication_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6d9b173a15450..0097f00281e2c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -20,6 +20,7 @@ #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> +#include <linux/numa_user_replication.h> #include <asm/elf.h> #include <asm/tlb.h> @@ -406,6 +407,10 @@ struct mem_size_stats { u64 pss_dirty; u64 pss_locked; u64 swap_pss; +#ifdef CONFIG_USER_REPLICATION + KABI_EXTEND(unsigned long replicated); + KABI_EXTEND(u64 pss_repl); +#endif }; static void smaps_page_accumulate(struct mem_size_stats *mss, @@ -438,6 +443,20 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, } } +#ifdef CONFIG_USER_REPLICATION +static void smaps_page_accumulate_replicated(struct mem_size_stats *mss, + struct page *page, unsigned long size, unsigned long pss, bool private) +{ + mss->pss += pss; + mss->pss_repl += pss; + + if (private) + mss->private_clean += size; + else + mss->shared_clean += size; +} +#endif + static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked, bool migration) @@ -493,6 +512,42 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, } } +#ifdef CONFIG_USER_REPLICATION +static void smaps_account_replicated(struct mem_size_stats *mss, + struct page **replica_pages, bool *numa_young) +{ + int nid; + int page_ref_count = 0; + unsigned long size = PAGE_SIZE; + unsigned long pss = PAGE_SIZE << PSS_SHIFT; + + for_each_memory_node(nid) { + mss->replicated += size; + mss->resident += size; + + if (numa_young[nid] || page_is_young(replica_pages[nid]) || + PageReferenced(replica_pages[nid])) + mss->referenced += size; + + if (!page_ref_count) { + page_ref_count = page_count(replica_pages[nid]); + pss /= page_ref_count; + } else + BUG_ON(page_ref_count != page_count(replica_pages[nid])); + + smaps_page_accumulate_replicated(mss, replica_pages[nid], size, pss, + (page_ref_count == 1)); + } + /* + * We account original page-instance as anonymous page, not replicated + */ + mss->replicated 
-= size; + mss->anonymous += size; + mss->pss_repl -= pss; + mss->pss_anon += pss; +} +#endif + #ifdef CONFIG_SHMEM static int smaps_pte_hole(unsigned long addr, unsigned long end, __always_unused int depth, struct mm_walk *walk) @@ -529,11 +584,48 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct page *page = NULL; bool migration = false, young = false, dirty = false; pte_t ptent = ptep_get(pte); +#ifdef CONFIG_USER_REPLICATION + bool numa_young[MAX_NUMNODES]; + struct page *replica_pages[MAX_NUMNODES]; +#endif if (pte_present(ptent)) { page = vm_normal_page(vma, addr, ptent); - young = pte_young(ptent); - dirty = pte_dirty(ptent); +#ifdef CONFIG_USER_REPLICATION + if (page && PageReplicated(page)) { + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_pte; + bool start; + int nid; + + for_each_pgtable(curr, curr_pte, pte, nid, offset, start) { + pte_t curr_ptent = ptep_get(curr_pte); + BUG_ON(pte_dirty(curr_ptent)); + numa_young[nid] = pte_young(curr_ptent); + replica_pages[nid] = vm_normal_page(vma, addr, curr_ptent); + } + } else if (numa_pgtable_replicated(pte)) { + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_pte; + + young = pte_young(ptent); + dirty = pte_dirty(ptent); + for_each_pgtable_replica(curr, curr_pte, pte, offset) { + pte_t curr_ptent = ptep_get(curr_pte); + + young |= pte_young(curr_ptent); + dirty |= pte_dirty(curr_ptent); + if (young && dirty) + break; + } + } else +#endif + { + young = pte_young(ptent); + dirty = pte_dirty(ptent); + } } else if (is_swap_pte(ptent)) { swp_entry_t swpent = pte_to_swp_entry(ptent); @@ -554,6 +646,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (is_migration_entry(swpent)) migration = true; page = pfn_swap_entry_to_page(swpent); + BUG_ON(page && PageReplicated(page)); } } else { smaps_pte_hole_lookup(addr, walk); @@ -563,7 +656,12 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, young, dirty, locked, migration); +#ifdef CONFIG_USER_REPLICATION + if (PageReplicated(page)) + smaps_account_replicated(mss, replica_pages, numa_young); + else +#endif + smaps_account(mss, page, false, young, dirty, locked, migration); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -827,6 +925,10 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, mss->pss_file >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nPss_Shmem: ", mss->pss_shmem >> PSS_SHIFT); +#ifdef CONFIG_USER_REPLICATION + SEQ_PUT_DEC(" kb\nPss_Repl: ", + mss->pss_repl >> PSS_SHIFT); +#endif } SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); @@ -834,6 +936,9 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); +#ifdef CONFIG_USER_REPLICATION + SEQ_PUT_DEC(" kB\nReplicated: ", mss->replicated); +#endif SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index a8b7b343a4ed3..e6013e315fc65 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -80,17 +80,17 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) static inline pgtable_t __pte_alloc_one_node(unsigned int nid, struct mm_struct *mm, gfp_t gfp) { - struct page 
*pte; + struct ptdesc *ptdesc; - pte = alloc_pages_node(nid, gfp, 0); - if (!pte) + ptdesc = pagetable_alloc_node(nid, gfp, 0); + if (!ptdesc) return NULL; - if (!pagetable_pte_ctor(page_ptdesc(pte))) { - __free_page(pte); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - return pte; + return ptdesc_page(ptdesc); } #endif @@ -112,7 +112,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) static inline pgtable_t pte_alloc_one_node(unsigned int nid, struct mm_struct *mm) { - return __pte_alloc_one_node(nid, mm, GFP_PGTABLE_USER | __GFP_THISNODE); + return __pte_alloc_one_node(nid, mm, GFP_PGTABLE_USER | __GFP_THISNODE | __GFP_MAYDIE); } #endif @@ -132,6 +132,7 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); + ClearPageReplicated(pte_page); pagetable_pte_dtor(ptdesc); pagetable_free(ptdesc); } @@ -179,7 +180,7 @@ static inline pmd_t *pmd_alloc_one_node(unsigned int nid, if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - gfp |= __GFP_THISNODE; + gfp |= __GFP_THISNODE | __GFP_MAYDIE; ptdesc = pagetable_alloc_node(nid, gfp, 0); if (!ptdesc) @@ -234,7 +235,7 @@ static inline pud_t *__pud_alloc_one_node(unsigned int nid, if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - gfp |= __GFP_THISNODE; + gfp |= __GFP_THISNODE | __GFP_MAYDIE; ptdesc = pagetable_alloc_node(nid, gfp, 0); if (!ptdesc) return NULL; diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 22384baee10e5..7063dbcbf02c9 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -247,7 +247,15 @@ static inline void tlb_remove_table_sync_one(void) { } * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ +#ifndef CONFIG_USER_REPLICATION #define MMU_GATHER_BUNDLE 8 +#else +#if MAX_NUMNODES < 8 +#define MMU_GATHER_BUNDLE 8 +#else +#define MMU_GATHER_BUNDLE MAX_NUMNODES +#endif +#endif struct mmu_gather_batch { struct mmu_gather_batch *next; @@ -269,6 +277,12 @@ struct mmu_gather_batch { extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); + +#ifdef CONFIG_USER_REPLICATION +extern bool __tlb_remove_replica_pages_size(struct mmu_gather *tlb, struct page **pages, + int page_size); +#endif + bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); @@ -482,6 +496,14 @@ static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE); } +#ifdef CONFIG_USER_REPLICATION +static __always_inline bool __tlb_remove_replica_pages(struct mmu_gather *tlb, + struct page **pages) +{ + return __tlb_remove_replica_pages_size(tlb, pages, PAGE_SIZE); +} +#endif + /* tlb_remove_page * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when * required. diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 62cea15eb6df9..4a3d3023bfdb5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -137,6 +137,7 @@ int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); +bool cgroup_has_tasks(struct cgroup *cgrp); /* * Iteration helpers and macros. 
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 435a5a8a301e0..7ca6d2f2fe7cf 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -59,8 +59,13 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_SKIP_ZERO 0 #define ___GFP_SKIP_KASAN 0 #endif +#ifdef CONFIG_USER_REPLICATION +#define ___GFP_MAYDIE 0x4000000u +#else +#define ___GFP_MAYDIE 0 +#endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x4000000u +#define ___GFP_NOLOCKDEP 0x8000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -257,8 +262,11 @@ typedef unsigned int __bitwise gfp_t; /* Alloc memory from mirrored region */ #define __GFP_RELIABLE ((__force gfp_t)___GFP_RELIABLE) +/* May fallback to oom-killer when used with __GFP_THISNODE */ +#define __GFP_MAYDIE ((__force gfp_t)___GFP_MAYDIE) + /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index abe236201e68f..9a0e03e58f059 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -421,7 +421,11 @@ struct mem_cgroup { struct dynamic_pool *dpool; #endif +#ifdef CONFIG_USER_REPLICATION + KABI_USE(1, bool user_replication_active) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/linux/mm.h b/include/linux/mm.h index f706eed1a8b53..7580817f038b6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -336,12 +336,16 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 39 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_7 40 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) +#define VM_HIGH_ARCH_7 BIT(VM_HIGH_ARCH_BIT_7) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS @@ -372,6 +376,21 @@ extern unsigned int kobjsize(const void *objp); # define VM_SHADOW_STACK VM_NONE #endif +#ifdef CONFIG_USER_REPLICATION +/* + * Page tables for this vma will be replicated during page faults + */ +# define VM_REPLICA_INIT VM_HIGH_ARCH_6 +/* + * Phys memory of this vma migth has replicas + * due to mprotect call or numa_balancer replication. + * Also, this flag is used by numa balancer as a hint, that this memory should + * be replicated. Obviously, that if this flag is set, + * VM_REPLICA_INIT also must be set. 
+ */ +# define VM_REPLICA_COMMIT VM_HIGH_ARCH_7 +#endif /* CONFIG_USER_REPLICATION */ + #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC) @@ -517,6 +536,20 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } +typedef enum { + /* Switch to default handling */ + REPLICA_NONE, + /* + * User replication stops here, + * already replicated levels need propagation + */ + REPLICA_PROPAGATE, + /* Keep replicating page tables */ + REPLICA_KEEP, + /* Failed to replicate page table level */ + REPLICA_FAIL, +} replica_action_t; + /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask @@ -571,9 +604,16 @@ struct vm_fault { * page table to avoid allocation from * atomic context. */ - KABI_RESERVE(1) - KABI_RESERVE(2) - KABI_RESERVE(3) + + KABI_EXTEND(unsigned long left_replicant) /* Closest vmas that require replicated tables */ + KABI_EXTEND(unsigned long right_replicant) + KABI_EXTEND(p4d_t *p4d) + KABI_EXTEND(pgd_t *pgd) + KABI_EXTEND(bool pte_replicated :1) + KABI_EXTEND(bool pmd_replicated :1) + KABI_EXTEND(bool pud_replicated :1) + KABI_EXTEND(bool p4d_replicated :1) + KABI_EXTEND(replica_action_t replica_action) /* last action performed with page table */ }; /* @@ -3023,6 +3063,7 @@ static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order) return page_ptdesc(page); } +#ifdef CONFIG_KERNEL_REPLICATION static inline struct ptdesc *pagetable_alloc_node(int nid, gfp_t gfp, unsigned int order) { @@ -3030,6 +3071,7 @@ static inline struct ptdesc *pagetable_alloc_node(int nid, gfp_t gfp, return page_ptdesc(page); } +#endif /** * pagetable_free - Free pagetables @@ -3051,15 +3093,40 @@ void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); void ptlock_free(struct ptdesc *ptdesc); +#ifdef CONFIG_KERNEL_REPLICATION +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) +{ + return ptdesc->master_table->ptl; +} +#else static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return ptdesc->ptl; } +#endif + #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) { } +#ifdef CONFIG_KERNEL_REPLICATION +static inline bool ptlock_alloc(struct ptdesc *ptdesc) +{ + ptdesc->master_table = ptdesc; + return true; +} + +static inline void ptlock_free(struct ptdesc *ptdesc) +{ + ptdesc->master_table = NULL; +} + +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) +{ + return &ptdesc->master_table->ptl; +} +#else static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; @@ -3073,6 +3140,8 @@ static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return &ptdesc->ptl; } +#endif + #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 386e1eaac1fa4..82bab18e45c14 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -598,7 +598,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) - set_pte_at(vma->vm_mm, addr, pte, + set_pte_at_replicated(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); #endif } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a0015a55211c..cb8737f18e6e5 100644 --- a/include/linux/mm_types.h 
+++ b/include/linux/mm_types.h @@ -461,12 +461,41 @@ struct ptdesc { pgtable_t pmd_huge_pte; }; }; +#ifdef CONFIG_KERNEL_REPLICATION + KABI_REPLACE( + unsigned long __page_mapping, + union { + unsigned long __page_mapping; + struct llist_head replica_list_head; /* required for connecting */ + struct llist_node replica_list_node; /* replicated tables into lists */ + } + ) + /* + * master_table is used only for pte and pmd levels, + * If we have, for example, 4 replicated pmd tables, + * we need to use single lock to correctly serialize modifications of this level. + * So, this field points to lock from original table. + * If tables are not replicated, or table is master, master_lock + * equals to ptl (or &ptl). + */ + KABI_REPLACE( + union { + struct mm_struct *pt_mm; + atomic_t pt_frag_refcount; + }, + union { + struct mm_struct *pt_mm; + atomic_t pt_frag_refcount; + struct ptdesc *master_table; + } + ) +#else unsigned long __page_mapping; - union { struct mm_struct *pt_mm; atomic_t pt_frag_refcount; }; +#endif union { unsigned long _pt_pad_2; @@ -716,6 +745,18 @@ struct mm_cid { int cid; }; +typedef enum { + /* Used only in not yet initialized mm-instances. + * It is matter for it to be zero. */ + FORK_NO_REPLICA = 0, + /* Used when user-replication shoud not be inherited during forks + * due to disabled user-replication in related mem cgroup. */ + FORK_DISCARD_REPLICA, + /* Used when user-replication should be inherited during forks + * due to enabled user-replication in related mem cgroup. */ + FORK_KEEP_REPLICA +} fork_policy_t; + struct kioctx_table; struct iommu_mm_data; struct mm_struct { @@ -986,7 +1027,13 @@ struct mm_struct { #else KABI_RESERVE(1) #endif + +#ifdef CONFIG_USER_REPLICATION + KABI_USE2(2, bool cg_user_replication_active, fork_policy_t fork_policy) +#else KABI_RESERVE(2) +#endif + KABI_RESERVE(3) KABI_RESERVE(4) KABI_RESERVE(5) diff --git a/include/linux/numa_kernel_replication.h b/include/linux/numa_kernel_replication.h index 37e7b56b5aa94..9607a6162dfc2 100644 --- a/include/linux/numa_kernel_replication.h +++ b/include/linux/numa_kernel_replication.h @@ -2,8 +2,6 @@ #ifndef _LINUX_NUMA_REPLICATION_H #define _LINUX_NUMA_REPLICATION_H -#ifdef CONFIG_KERNEL_REPLICATION - #include <linux/kabi.h> /* @@ -20,6 +18,7 @@ #include KABI_HIDE_INCLUDE(<linux/nodemask.h>) #include KABI_HIDE_INCLUDE(<linux/module.h>) #include KABI_HIDE_INCLUDE(<linux/mm.h>) +#include KABI_HIDE_INCLUDE(<linux/llist.h>) #include KABI_HIDE_INCLUDE(<asm/numa_replication.h>) #if defined(tmp_linux_value) @@ -42,6 +41,8 @@ extern nodemask_t replica_nodes; nid != MAX_NUMNODES; \ nid = next_node(nid, replica_nodes)) +#ifdef CONFIG_KERNEL_REPLICATION + bool is_text_replicated(void); static inline pgd_t *this_node_pgd(struct mm_struct *mm) @@ -74,6 +75,48 @@ static inline bool numa_addr_has_replica(const void *addr) ((unsigned long)addr <= PAGE_TABLE_REPLICATION_RIGHT); } +static inline void clear_pgtable_list(struct ptdesc *head) +{ + struct llist_node *node; + + /* Replica list already have been destroyed */ + if (head->replica_list_node.next == NULL) + return; + + for (node = llist_del_first(&head->replica_list_head); + node != &head->replica_list_node; + node = llist_del_first(&head->replica_list_head)) + node->next = NULL; + head->replica_list_node.next = NULL; +} + +static inline void build_pgd_chain(pgd_t **tables) +{ + int nid; + int prev_node = -1; + for_each_memory_node(nid) { + virt_to_ptdesc(tables[nid])->replica_list_head.first = NULL; + if (prev_node != -1) { + llist_add( + 
&virt_to_ptdesc(tables[nid])->replica_list_node, &virt_to_ptdesc(tables[prev_node])->replica_list_head); + } else { + /* + * This list is not supposed to be circular, + * but in order to simplify macro implementation, + * we do it anyway. + * God help us + */ + virt_to_ptdesc(tables[nid])->replica_list_node.next = &virt_to_ptdesc(tables[nid])->replica_list_node; + } + prev_node = nid; + } +} + +static inline bool numa_pgtable_replicated(void *table) +{ + return PageReplicated(virt_to_page(table)); +} + void __init numa_replication_init(void); void __init numa_replicate_kernel_text(void); void numa_replicate_kernel_rodata(void); @@ -86,8 +129,63 @@ int numa_get_memory_node(int nid); void dump_mm_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end); +static inline unsigned long offset_in_table(void *ptr) +{ + return (unsigned long)ptr & (~PAGE_MASK); +} + +static inline unsigned long get_table_ptr(struct ptdesc *table, unsigned long offset) +{ + return ((unsigned long)ptdesc_to_virt(table) + offset); +} + +/** + * @pos: struct ptdesc* of current replica + * @table: current table entry to write (virtaul address) + * @head_table: table entry from 0th node, will be a part of this loop + * @nid: node id of current pgtable + * @offset: offset of current table entry in table page in bytes [0 .. 4088] + * @start: boolean value for tmp storage + */ +#define for_each_pgtable(pos, table, head_table, nid, offset, start) \ + for (pos = llist_entry(&virt_to_ptdesc(head_table)->replica_list_node, typeof(*pos), replica_list_node), \ + start = true, nid = page_to_nid(ptdesc_page(pos)), \ + offset = offset_in_table(head_table), table = (typeof(table))get_table_ptr(pos, offset); \ + pos != virt_to_ptdesc(head_table) || start; \ + pos = llist_entry((pos)->replica_list_node.next, typeof(*pos), replica_list_node), \ + table = (typeof(table))get_table_ptr(pos, offset), \ + nid = page_to_nid(ptdesc_page(pos)), start = false) + +/** + * @pos: struct ptdesc* of current replica + * @table: current table entry to write (virtaul address) + * @head_table: table entry from 0th node, will not be a part of this loop + * @offset: offset of current table entry in table page in bytes [0 .. 4088] + */ +#define for_each_pgtable_replica(pos, table, head_table, offset) \ + for (pos = llist_entry(virt_to_ptdesc(head_table)->replica_list_node.next, typeof(*pos), replica_list_node), \ + offset = offset_in_table(head_table), table = (typeof(table))get_table_ptr(pos, offset); \ + pos != virt_to_ptdesc(head_table); \ + pos = llist_entry((pos)->replica_list_node.next, typeof(*pos), replica_list_node), \ + table = (typeof(table))get_table_ptr(pos, offset)) + +/** Safe against removal pos + * @pos: struct ptdesc* of current replica + * @n: tmp storage + * @table: current table entry to write (virtaul address) + * @head_table: table entry from 0th node, will not be a part of this loop + * @offset: offset of current table entry in table page in bytes [0 .. 
4088] + */ +#define for_each_pgtable_replica_safe(pos, n, table, head_table, offset) \ + for (pos = llist_entry(virt_to_ptdesc(head_table)->replica_list_node.next, typeof(*pos), replica_list_node), \ + n = llist_entry((pos)->replica_list_node.next, typeof(*pos), replica_list_node), \ + offset = offset_in_table(head_table), table = (typeof(table))get_table_ptr(pos, offset); \ + pos != virt_to_ptdesc(head_table); \ + pos = n, n = llist_entry((pos)->replica_list_node.next, typeof(*pos), replica_list_node), \ + table = (typeof(table))get_table_ptr(pos, offset)) + /* Macro to walk over mm->pgd_numa and cast it to appropriate level type */ -#define for_each_pgtable_replica(table, mm, replica, nid, offset) \ +#define for_each_pgtable_kernel_replica(table, mm, replica, nid, offset) \ for (nid = first_node(replica_nodes), offset = ((unsigned long)table) & (~PAGE_MASK), \ replica = (typeof(table))(((unsigned long)mm->pgd_numa[nid]) + offset); \ nid != MAX_NUMNODES; \ @@ -96,81 +194,82 @@ void dump_mm_pgtables(struct mm_struct *mm, static inline void pgd_populate_replicated(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) { - int nid; - pgd_t *curr_pgd; - unsigned long offset; + pgd_populate(mm, pgdp, p4dp); + + if (!is_text_replicated()) + return; - if (get_propagation_level() == PGD_PROPAGATION) { - for_each_pgtable_replica(pgdp, mm, curr_pgd, nid, offset) { + if (numa_pgtable_replicated(pgdp)) { + unsigned long offset; + struct ptdesc *curr; + pgd_t *curr_pgd; + for_each_pgtable_replica(curr, curr_pgd, pgdp, offset) { pgd_populate(mm, curr_pgd, p4dp); } - } else { - pgd_populate(mm, pgdp, p4dp); } } static inline void p4d_populate_replicated(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) { - int nid; - p4d_t *curr_p4d; - unsigned long offset; + p4d_populate(mm, p4dp, pudp); + + if (!is_text_replicated()) + return; - if (get_propagation_level() == P4D_PROPAGATION) { - for_each_pgtable_replica(p4dp, mm, curr_p4d, nid, offset) { + if (numa_pgtable_replicated(p4dp)) { + unsigned long offset; + struct ptdesc *curr; + p4d_t *curr_p4d; + for_each_pgtable_replica(curr, curr_p4d, p4dp, offset) { p4d_populate(mm, curr_p4d, pudp); } - } else { - p4d_populate(mm, p4dp, pudp); } } static inline void pud_populate_replicated(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp) { - int nid; - pud_t *curr_pud; - unsigned long offset; + pud_populate(mm, pudp, pmdp); - if (get_propagation_level() == PUD_PROPAGATION) { - for_each_pgtable_replica(pudp, mm, curr_pud, nid, offset) { + if (!is_text_replicated()) + return; + + if (numa_pgtable_replicated(pudp)) { + unsigned long offset; + struct ptdesc *curr; + pud_t *curr_pud; + for_each_pgtable_replica(curr, curr_pud, pudp, offset) { pud_populate(mm, curr_pud, pmdp); } - } else { - pud_populate(mm, pudp, pmdp); } } static inline void pmd_populate_replicated(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) { - int nid; - pmd_t *curr_pmd; - unsigned long offset; + pmd_populate(mm, pmdp, ptep); + + if (!is_text_replicated()) + return; - if (get_propagation_level() == PMD_PROPAGATION) { - for_each_pgtable_replica(pmdp, mm, curr_pmd, nid, offset) { + if (numa_pgtable_replicated(pmdp)) { + unsigned long offset; + struct ptdesc *curr; + pmd_t *curr_pmd; + for_each_pgtable_replica(curr, curr_pmd, pmdp, offset) { pmd_populate(mm, curr_pmd, ptep); } - } else { - pmd_populate(mm, pmdp, ptep); } } #else -#if defined(linux) -#define tmp_linux_value linux -#undef linux -#endif - -#include KABI_HIDE_INCLUDE(<linux/mm.h>) - -#if defined(tmp_linux_value) -#define linux tmp_linux_value 
-#undef tmp_linux_value -#endif - #define this_node_pgd(mm) ((mm)->pgd) #define per_node_pgd(mm, nid) ((mm)->pgd) +static inline bool numa_pgtable_replicated(void *table) +{ + return false; +} + static inline void numa_setup_pgd(void) { } diff --git a/include/linux/numa_user_replication.h b/include/linux/numa_user_replication.h new file mode 100644 index 0000000000000..74653b7f7da40 --- /dev/null +++ b/include/linux/numa_user_replication.h @@ -0,0 +1,738 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_NUMA_USER_REPLICATION_H +#define _LINUX_NUMA_USER_REPLICATION_H + +#include <linux/kabi.h> +#include <linux/mm_inline.h> +#include <linux/numa_kernel_replication.h> + +/* Same as in numa_kernel_replication.h */ +#if defined(linux) +#define tmp_linux_value linux +#undef linux +#endif + +#include KABI_HIDE_INCLUDE(<linux/mempolicy.h>) + +#if defined(tmp_linux_value) +#define linux tmp_linux_value +#undef tmp_linux_value +#endif + +#ifdef CONFIG_USER_REPLICATION + +struct pgtable_private { + pte_t *pte_numa[MAX_NUMNODES]; + struct page *replica_pages[MAX_NUMNODES]; + bool pte_replicated; +}; + +static inline void pgtable_pte_step(struct pgtable_private *zp, int nr) +{ + int nid; + if (zp->pte_replicated) + for_each_memory_node(nid) + zp->pte_numa[nid] += nr; +} + +static inline void pgtable_update_pte(struct pgtable_private *zp, pte_t *pte) +{ + zp->pte_numa[page_to_nid(virt_to_page(pte))] = pte; + zp->pte_replicated = false; + + if (numa_pgtable_replicated(pte)) { + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_pte; + int nid; + zp->pte_replicated = true; + + for_each_pgtable_replica(curr, curr_pte, pte, offset) { + nid = page_to_nid(ptdesc_page(curr)); + zp->pte_numa[nid] = curr_pte; + } + } +} + +static inline void set_master_page_for_puds(int allocated_node, pud_t **new) +{ + int nid; + struct ptdesc *master_table; + struct ptdesc *curr_table; + + if (allocated_node == NUMA_NO_NODE) + allocated_node = first_memory_node; + + master_table = virt_to_ptdesc(new[allocated_node]); + + for_each_memory_node(nid) { + curr_table = virt_to_ptdesc(new[nid]); + curr_table->master_table = master_table; + } +} + +static inline void set_master_page_for_pmds(int allocated_node, pmd_t **new) +{ + int nid; + struct ptdesc *master_table; + struct ptdesc *curr_table; + + if (allocated_node == NUMA_NO_NODE) + allocated_node = first_memory_node; + + master_table = virt_to_ptdesc(new[allocated_node]); + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + curr_table = virt_to_ptdesc(new[nid]); + curr_table->master_table = master_table; + } +} + +static inline void set_master_page_for_ptes(int allocated_node, struct ptdesc **new) +{ + int nid; + struct ptdesc *master_table; + struct ptdesc *curr_table; + + if (allocated_node == NUMA_NO_NODE) + allocated_node = first_memory_node; + + master_table = new[allocated_node]; + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + curr_table = new[nid]; + curr_table->master_table = master_table; + } +} + +void numa_mm_handle_replication(struct mm_struct *mm, bool enable, fork_policy_t fork_policy); +int phys_duplicate(struct vm_area_struct *vma, unsigned long start, size_t len); +int phys_deduplicate(struct vm_area_struct *vma, unsigned long start, size_t len, bool alloc_new_page); +unsigned long phys_duplicate_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long end); + +static inline int numa_is_vma_replicant(struct vm_area_struct *vma) +{ + if 
(vma->vm_flags & VM_REPLICA_INIT) + return 1; + return 0; +} + +static inline bool vma_has_replicas(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_REPLICA_COMMIT; +} + +static inline bool vmflags_might_be_replicated(vm_flags_t vm_flags) +{ + return (vm_flags & VM_REPLICA_INIT) && (vm_flags & VM_ACCESS_FLAGS) && + !(vm_flags & (VM_WRITE | VM_SHARED | VM_LOCKED_MASK)); +} + +static inline bool vma_might_be_replicated(struct vm_area_struct *vma) +{ + return (vma->vm_file || vma_is_anonymous(vma)) && + vmflags_might_be_replicated(vma->vm_flags); +} + +static inline bool memcg_replication_enabled(struct mm_struct *mm) +{ + return mm->cg_user_replication_active; +} + +/* + * Arch specific implementation + */ +#if defined(CONFIG_ARM64) && !defined(CONFIG_ARM64_CONTPTE) + +static inline void set_ptes_replicated(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + pte_t *ptep_numa[MAX_NUMNODES]; + unsigned long offset; + bool start; + struct ptdesc *curr; + pte_t *curr_ptep; + int nid; + + if (!numa_pgtable_replicated(ptep)) + return set_ptes(mm, addr, ptep, pte, nr); + + for_each_pgtable(curr, curr_ptep, ptep, nid, offset, start) { + ptep_numa[nid] = curr_ptep; + } + + for_each_memory_node(nid) + page_table_check_ptes_set(mm, ptep_numa[nid], pte, nr); + __sync_cache_and_tags(pte, nr); + + for (;;) { + for_each_memory_node(nid) + __check_safe_pte_update(mm, ptep_numa[nid], pte); + + for_each_memory_node(nid) + WRITE_ONCE(*ptep_numa[nid], pte); + if (pte_valid_not_user(pte)) { + dsb(ishst); + isb(); + } + + if (--nr == 0) + break; + for_each_memory_node(nid) + ptep_numa[nid]++; + pte = pte_advance_pfn(pte, 1); + } +} +#define set_pte_at_replicated(mm, addr, ptep, pte) set_ptes_replicated(mm, addr, ptep, pte, 1) + +#ifdef CONFIG_MMU_NOTIFIER +#define set_pte_at_notify_replicated(__mm, __address, __ptep, __pte) \ +({ \ + struct mm_struct *___mm = __mm; \ + unsigned long ___address = __address; \ + pte_t ___pte = __pte; \ + \ + mmu_notifier_change_pte(___mm, ___address, ___pte); \ + set_pte_at_replicated(___mm, ___address, __ptep, ___pte); \ +}) +#else +#define set_pte_at_notify_replicated set_pte_at_replicated +#endif + +static inline void wrprotect_ptes_replicated(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + wrprotect_ptes(mm, addr, ptep, nr); + + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_ptep; + for_each_pgtable_replica(curr, curr_ptep, ptep, offset) { + wrprotect_ptes(mm, addr, curr_ptep, nr); + } + } +} + +static inline void pte_clear_replicated(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = __pte(0); + + WRITE_ONCE(*ptep, pte); + + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_ptep; + for_each_pgtable_replica(curr, curr_ptep, ptep, offset) + WRITE_ONCE(*curr_ptep, pte); + } + + /* + * Only if the new pte is valid and kernel, otherwise TLB maintenance + * or update_mmu_cache() have the necessary barriers. 
+ */ + if (pte_valid_not_user(pte)) { + dsb(ishst); + isb(); + } +} + +static inline void pmd_clear_replicated(pmd_t *pmdp) +{ + pmd_t pmd = __pmd(0); + +#ifdef __PAGETABLE_PMD_FOLDED + if (in_swapper_pgdir(pmdp)) { + BUG_ON(numa_pgtable_replicated(pmdp)); + set_swapper_pgd((pgd_t *)pmdp, __pgd(pmd_val(pmd))); + return; + } +#endif /* __PAGETABLE_PMD_FOLDED */ + + WRITE_ONCE(*pmdp, pmd); + + if (numa_pgtable_replicated(pmdp)) { + unsigned long offset; + struct ptdesc *curr; + pmd_t *curr_pmdp; + for_each_pgtable_replica(curr, curr_pmdp, pmdp, offset) + WRITE_ONCE(*curr_pmdp, pmd); + } + + if (pmd_valid(pmd)) { + dsb(ishst); + isb(); + } +} + +static inline void pud_clear_replicated(pud_t *pudp) +{ + pud_t pud = __pud(0); + +#ifdef __PAGETABLE_PUD_FOLDED + if (in_swapper_pgdir(pudp)) { + BUG_ON(numa_pgtable_replicated(pudp)); + set_swapper_pgd((pgd_t *)pudp, __pgd(pud_val(pud))); + return; + } +#endif /* __PAGETABLE_PUD_FOLDED */ + + WRITE_ONCE(*pudp, pud); + + if (numa_pgtable_replicated(pudp)) { + unsigned long offset; + struct ptdesc *curr; + pud_t *curr_pudp; + for_each_pgtable_replica(curr, curr_pudp, pudp, offset) + WRITE_ONCE(*curr_pudp, pud); + } + + if (pud_valid(pud)) { + dsb(ishst); + isb(); + } +} + +static inline void pte_clear_not_present_full_replicated(struct mm_struct *mm, + unsigned long address, pte_t *ptep, int full) +{ + pte_clear_replicated(mm, address, ptep); +} + +static inline void clear_not_present_full_ptes_replicated(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + pte_clear_not_present_full_replicated(mm, addr, ptep, full); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} + +static inline pte_t ptep_get_and_clear_replicated(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = ptep_get_and_clear(mm, addr, ptep); + + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_ptep; + for_each_pgtable_replica(curr, curr_ptep, ptep, offset) { + pte_t curr_pte = ptep_get_and_clear(mm, addr, curr_ptep); + + if (pte_dirty(curr_pte)) + pte = pte_mkdirty(pte); + if (pte_young(curr_pte)) + pte = pte_mkyoung(pte); + } + } + + return pte; +} + +static inline pte_t ptep_clear_flush_replicated(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + struct mm_struct *mm = (vma)->vm_mm; + pte_t pte; + pte = ptep_get_and_clear_replicated(mm, address, ptep); + if (pte_accessible(mm, pte)) + flush_tlb_page(vma, address); + return pte; +} + +static inline pte_t ptep_modify_prot_start_replicated(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + if (IS_ENABLED(CONFIG_ARM64_ERRATUM_2645198) && + cpus_have_const_cap(ARM64_WORKAROUND_2645198)) { + /* + * Break-before-make (BBM) is required for all user space mappings + * when the permission changes from executable to non-executable + * in cases where cpu is affected with errata #2645198. 
+ */ + if (pte_user_exec(ptep_get(ptep))) + return ptep_clear_flush_replicated(vma, addr, ptep); + } + return ptep_get_and_clear_replicated(vma->vm_mm, addr, ptep); +} + +static inline void ptep_modify_prot_commit_replicated(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) +{ + set_pte_at_replicated(vma->vm_mm, addr, ptep, pte); +} + +static inline int ptep_test_and_clear_young_replicated(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int ret = ptep_test_and_clear_young(vma, address, ptep); + + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_ptep; + for_each_pgtable_replica(curr, curr_ptep, ptep, offset) { + ret |= ptep_test_and_clear_young(vma, address, curr_ptep); + } + } + + return ret; +} + +static inline int ptep_clear_flush_young_replicated(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young = ptep_test_and_clear_young_replicated(vma, address, ptep); + + if (young) { + /* + * We can elide the trailing DSB here since the worst that can + * happen is that a CPU continues to use the young entry in its + * TLB and we mistakenly reclaim the associated page. The + * window for such an event is bounded by the next + * context-switch, which provides a DSB to complete the TLB + * invalidation. + */ + flush_tlb_page_nosync(vma, address); + } + + return young; +} + +#ifdef CONFIG_MMU_NOTIFIER +#define ptep_clear_young_notify_replicated(__vma, __address, __ptep) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __young = ptep_test_and_clear_young_replicated(___vma, ___address, __ptep);\ + __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ + ___address + PAGE_SIZE); \ + __young; \ +}) + +#define ptep_clear_flush_young_notify_replicated(__vma, __address, __ptep) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __young = ptep_clear_flush_young_replicated(___vma, ___address, __ptep); \ + __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ + ___address, \ + ___address + \ + PAGE_SIZE); \ + __young; \ +}) +#else +#define ptep_clear_young_notify_replicated ptep_test_and_clear_young_replicated +#define ptep_clear_flush_young_notify_replicated ptep_clear_flush_young_replicated +#endif + +static inline int __ptep_set_access_flags_no_flush(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, pte_t entry) +{ + pteval_t old_pteval, pteval; + pte_t pte = __ptep_get(ptep); + + if (pte_same(pte, entry)) + return 0; + + /* only preserve the access flags and write permission */ + pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; + + /* + * Setting the flags must be done atomically to avoid racing with the + * hardware update of the access/dirty state. The PTE_RDONLY bit must + * be set to the most permissive (lowest value) of *ptep and entry + * (calculated as: a & b == ~(~a | ~b)). 
+ */ + pte_val(entry) ^= PTE_RDONLY; + pteval = pte_val(pte); + do { + old_pteval = pteval; + pteval ^= PTE_RDONLY; + pteval |= pte_val(entry); + pteval ^= PTE_RDONLY; + pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); + } while (pteval != old_pteval); + + return 1; +} + +static inline int ptep_set_access_flags_replicated(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, pte_t entry, int dirty) +{ + int ret = __ptep_set_access_flags_no_flush(vma, addr, ptep, entry); + + if (numa_pgtable_replicated(ptep)) { + pgprot_t prot = pte_pgprot(entry); + + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_ptep; + for_each_pgtable_replica(curr, curr_ptep, ptep, offset) { + WARN_ON(!pte_present(__ptep_get(curr_ptep))); + + entry = pfn_pte(pte_pfn(__ptep_get(curr_ptep)), prot); + ret |= __ptep_set_access_flags_no_flush(vma, addr, curr_ptep, entry); + } + } + + /* Invalidate a stale read-only entry */ + if (dirty) + flush_tlb_page(vma, addr); + + return ret; +} + +static inline void clear_young_dirty_ptes_replicated(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) +{ + clear_young_dirty_ptes(vma, addr, ptep, nr, flags); + + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_ptep; + for_each_pgtable_replica(curr, curr_ptep, ptep, offset) + clear_young_dirty_ptes(vma, addr, curr_ptep, nr, flags); + } +} + +#endif + +static inline void build_pte_chain(struct ptdesc **tables) +{ + int nid; + int prev_node = -1; + for_each_memory_node(nid) { + tables[nid]->replica_list_head.first = NULL; + if (prev_node != -1) { + llist_add(&tables[nid]->replica_list_node, &tables[prev_node]->replica_list_head); + } else { + tables[nid]->replica_list_node.next = &tables[nid]->replica_list_node; + } + prev_node = nid; + } +} + +static inline void build_pmd_chain(pmd_t **tables) +{ + int nid; + int prev_node = -1; + for_each_memory_node(nid) { + struct ptdesc *curr_ptdesc = virt_to_ptdesc(tables[nid]); + + curr_ptdesc->replica_list_head.first = NULL; + if (prev_node != -1) { + struct ptdesc *prev_ptdesc = virt_to_ptdesc(tables[prev_node]); + llist_add(&curr_ptdesc->replica_list_node, &prev_ptdesc->replica_list_head); + } else { + curr_ptdesc->replica_list_node.next = &curr_ptdesc->replica_list_node; + } + prev_node = nid; + } +} + +static inline void build_pud_chain(pud_t **tables) +{ + int nid; + int prev_node = -1; + for_each_memory_node(nid) { + struct ptdesc *curr_ptdesc = virt_to_ptdesc(tables[nid]); + + curr_ptdesc->replica_list_head.first = NULL; + if (prev_node != -1) { + struct ptdesc *prev_ptdesc = virt_to_ptdesc(tables[prev_node]); + llist_add(&curr_ptdesc->replica_list_node, &prev_ptdesc->replica_list_head); + } else { + curr_ptdesc->replica_list_node.next = &curr_ptdesc->replica_list_node; + } + prev_node = nid; + } +} + +static inline void build_p4d_chain(p4d_t **tables) +{ + int nid; + int prev_node = -1; + for_each_memory_node(nid) { + struct ptdesc *curr_ptdesc = virt_to_ptdesc(tables[nid]); + + curr_ptdesc->replica_list_head.first = NULL; + if (prev_node != -1) { + struct ptdesc *prev_ptdesc = virt_to_ptdesc(tables[prev_node]); + llist_add(&curr_ptdesc->replica_list_node, &prev_ptdesc->replica_list_head); + } else { + curr_ptdesc->replica_list_node.next = &curr_ptdesc->replica_list_node; + } + prev_node = nid; + } +} + +pgd_t *fault_pgd_offset(struct vm_fault *vmf, unsigned long address); +p4d_t *fault_p4d_alloc(struct vm_fault *vmf, struct mm_struct *mm, pgd_t *pgd, unsigned long 
address); +pud_t *fault_pud_alloc(struct vm_fault *vmf, struct mm_struct *mm, p4d_t *p4d, unsigned long address); +pmd_t *fault_pmd_alloc(struct vm_fault *vmf, struct mm_struct *mm, pud_t *pud, unsigned long address); +int fault_pte_alloc(struct vm_fault *vmf); + +pte_t *cpr_alloc_pte_map_lock(struct mm_struct *dst_mm, unsigned long addr, + pmd_t *src_pmd, pmd_t *dst_pmd, spinlock_t **ptl); +pmd_t *cpr_alloc_pmd(struct mm_struct *dst_mm, unsigned long addr, + pud_t *src_pud, pud_t *dst_pud); +pud_t *cpr_alloc_pud(struct mm_struct *dst_mm, unsigned long addr, + p4d_t *src_p4d, p4d_t *dst_p4d); +p4d_t *cpr_alloc_p4d(struct mm_struct *dst_mm, unsigned long addr, + pgd_t *src_pgd, pgd_t *dst_pgd); + +static inline void cleanup_pte_list(pgtable_t table) +{ + page_ptdesc(table)->replica_list_node.next = NULL; +} + +static inline void cleanup_pmd_list(struct ptdesc *table) +{ +#ifndef __PAGETABLE_PMD_FOLDED + table->replica_list_node.next = NULL; +#endif +} + +static inline void cleanup_pud_list(struct ptdesc *table) +{ +#ifndef __PAGETABLE_PUD_FOLDED + table->replica_list_node.next = NULL; +#endif +} + +static inline void cleanup_p4d_list(struct ptdesc *table) +{ +#ifndef __PAGETABLE_P4D_FOLDED + table->replica_list_node.next = NULL; +#endif +} + +#else /* !CONFIG_USER_REPLICATION */ + +struct pgtable_private { + pte_t **pte_numa; + struct page **replica_pages; + bool pte_replicated; +}; + +static inline void pgtable_pte_step(struct pgtable_private *zp, int nr) { } +static inline void pgtable_update_pte(struct pgtable_private *zp, pte_t *pte) +{ + zp->pte_replicated = false; +} + +static inline int numa_is_vma_replicant(struct vm_area_struct *vma) +{ + return 0; +} + +static inline bool vma_has_replicas(struct vm_area_struct *vma) +{ + return 0; +} + +static inline bool vmflags_might_be_replicated(vm_flags_t vm_flags) +{ + return 0; +} + +static inline bool vma_might_be_replicated(struct vm_area_struct *vma) +{ + return 0; +} + +static inline bool memcg_replication_enabled(struct mm_struct *mm) +{ + return 0; +} + +static inline pgd_t *fault_pgd_offset(struct vm_fault *vmf, unsigned long address) +{ + return pgd_offset(vmf->vma->vm_mm, address); +} + +static inline p4d_t *fault_p4d_alloc(struct vm_fault *vmf, struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + return p4d_alloc(mm, pgd, address); +} + +static inline pud_t *fault_pud_alloc(struct vm_fault *vmf, struct mm_struct *mm, p4d_t *p4d, unsigned long address) +{ + return pud_alloc(mm, p4d, address); +} + +static inline pmd_t *fault_pmd_alloc(struct vm_fault *vmf, struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + return pmd_alloc(mm, pud, address); +} + +static inline int fault_pte_alloc(struct vm_fault *vmf) +{ + return 0; +} + +static inline pte_t *cpr_alloc_pte_map_lock(struct mm_struct *dst_mm, unsigned long addr, + pmd_t *src_pmd, pmd_t *dst_pmd, spinlock_t **ptl) +{ + return pte_alloc_map_lock(dst_mm, dst_pmd, addr, ptl); +} + +static inline pmd_t *cpr_alloc_pmd(struct mm_struct *dst_mm, unsigned long addr, + pud_t *src_pud, pud_t *dst_pud) +{ + return pmd_alloc(dst_mm, dst_pud, addr); +} + +static inline pud_t *cpr_alloc_pud(struct mm_struct *dst_mm, unsigned long addr, + p4d_t *src_p4d, p4d_t *dst_p4d) +{ + return pud_alloc(dst_mm, dst_p4d, addr); +} + +static inline p4d_t *cpr_alloc_p4d(struct mm_struct *dst_mm, unsigned long addr, + pgd_t *src_pgd, pgd_t *dst_pgd) +{ + return p4d_alloc(dst_mm, dst_pgd, addr); +} + +#define set_ptes_replicated set_ptes +#define set_pte_at_replicated set_pte_at +#define 
set_pte_at_notify_replicated set_pte_at_notify + +#define wrprotect_ptes_replicated wrprotect_ptes + +#define pte_clear_replicated pte_clear +#define pmd_clear_replicated pmd_clear +#define pud_clear_replicated pud_clear + +#define pte_clear_not_present_full_replicated pte_clear_not_present_full +#define clear_not_present_full_ptes_replicated clear_not_present_full_ptes + +#define ptep_get_and_clear_replicated ptep_get_and_clear +#define ptep_clear_flush_replicated ptep_clear_flush +#define ptep_modify_prot_start_replicated ptep_modify_prot_start +#define ptep_modify_prot_commit_replicated ptep_modify_prot_commit +#define ptep_clear_young_notify_replicated ptep_clear_young_notify +#define ptep_clear_flush_young_notify_replicated ptep_clear_flush_young_notify + +#define ptep_set_access_flags_replicated ptep_set_access_flags +#define clear_young_dirty_ptes_replicated clear_young_dirty_ptes + +#endif /* CONFIG_USER_REPLICATION */ + +#endif /* _LINUX_NUMA_USER_REPLICATION_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7a67d997eecea..e784fca29060b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -139,6 +139,10 @@ enum pageflags { #ifdef CONFIG_DYNAMIC_POOL PG_pool, /* Page is allocated from dynamic pool */ #endif + +#ifdef CONFIG_KERNEL_REPLICATION + PG_replicated, +#endif __NR_PAGEFLAGS, PG_readahead = PG_reclaim, @@ -635,6 +639,12 @@ PAGEFLAG(Pool, pool, PF_NO_TAIL) PAGEFLAG_FALSE(Pool, pool) #endif +#ifdef CONFIG_KERNEL_REPLICATION +PAGEFLAG(Replicated, replicated, PF_ANY) +#else +PAGEFLAG_FALSE(Replicated, replicated) +#endif + /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; @@ -1092,6 +1102,12 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) #define __PG_MLOCKED 0 #endif +#ifdef CONFIG_KERNEL_REPLICATION +#define __PG_REPLICATED (1UL << PG_replicated) +#else +#define __PG_REPLICATED 0 +#endif + /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. If they are, there is a problem. @@ -1101,7 +1117,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ - 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) + 1UL << PG_unevictable | __PG_REPLICATED | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. 
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 6104fa2b6e477..9658b8f17c393 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -51,7 +51,8 @@ gfpflag_string(__GFP_DIRECT_RECLAIM), \ gfpflag_string(__GFP_KSWAPD_RECLAIM), \ gfpflag_string(__GFP_ZEROTAGS), \ - gfpflag_string(__GFP_RELIABLE) + gfpflag_string(__GFP_RELIABLE), \ + gfpflag_string(__GFP_MAYDIE) #ifdef CONFIG_KASAN_HW_TAGS #define __def_gfpflag_names_kasan , \ @@ -96,6 +97,12 @@ #define IF_HAVE_PG_POOL(_name) #endif +#ifdef CONFIG_KERNEL_REPLICATION +#define IF_HAVE_PG_REPLICATED(_name) ,{1UL << PG_##_name, __stringify(_name)} +#else +#define IF_HAVE_PG_REPLICATED(_name) +#endif + #ifdef CONFIG_ARCH_USES_PG_ARCH_X #define IF_HAVE_PG_ARCH_X(_name) ,{1UL << PG_##_name, __stringify(_name)} #else @@ -132,6 +139,7 @@ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ IF_HAVE_PG_IDLE(young) \ IF_HAVE_PG_POOL(pool) \ +IF_HAVE_PG_REPLICATED(replicated) \ IF_HAVE_PG_ARCH_X(arch_2) \ IF_HAVE_PG_ARCH_X(arch_3) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 14e5498efd7ac..26e2856929032 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -16,6 +16,7 @@ #define PROT_NONE 0x0 /* page can not be accessed */ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ +#define PROT_REPLICA 0x200000 /* VM_REPLICA_COMMIT make replicated pte entries to point to copied numa-local physical pages */ /* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ @@ -29,6 +30,7 @@ #define MAP_HUGETLB 0x040000 /* create a huge page mapping */ #define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ +#define MAP_REPLICA 0x200000 /* VM_REPLICA_INIT */ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c26a9b3a35768..d0362c46b1480 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -352,7 +352,7 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static bool cgroup_has_tasks(struct cgroup *cgrp) +bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1060cf1524370..5face8ad9d3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,7 @@ #include <linux/task_work.h> #include <linux/shmem_fs.h> #include <linux/khugepaged.h> +#include <linux/numa_user_replication.h> #include <linux/uprobes.h> @@ -194,9 +195,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, } flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); - ptep_clear_flush(vma, addr, pvmw.pte); + ptep_clear_flush_replicated(vma, addr, pvmw.pte); if (new_page) - set_pte_at_notify(mm, addr, pvmw.pte, + set_pte_at_notify_replicated(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); add_reliable_folio_counter(old_folio, mm, -1); diff --git a/kernel/fork.c b/kernel/fork.c index f30b24c68442b..5b2a9806b4f88 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,6 +96,7 @@ #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> 
+#include <linux/numa_user_replication.h> #include <linux/stackprotector.h> #include <linux/user_events.h> #include <linux/iommu.h> @@ -749,6 +750,21 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, */ if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); +#ifdef CONFIG_USER_REPLICATION + /* + * Clear the VM_REPLICA_* flags when replication is being discarded. + */ + if (memcg_replication_enabled(oldmm) && !memcg_replication_enabled(mm)) { + vm_flags_clear(tmp, VM_REPLICA_INIT | VM_REPLICA_COMMIT); + } + /* + * VM_LOCKED_MASK was cleared above, so tmp may now satisfy + * vma_might_be_replicated(). + */ + else if (memcg_replication_enabled(mm) && vma_might_be_replicated(tmp)) { + vm_flags_mod(tmp, VM_REPLICA_COMMIT, VM_NONE); + } +#endif /* * Link the vma into the MT. After using __mt_dup(), memory @@ -1320,6 +1336,30 @@ static void mm_init_uprobes_state(struct mm_struct *mm) #endif } +static void mm_init_numa_replication_state(struct mm_struct *mm) +{ +#ifdef CONFIG_USER_REPLICATION + switch (mm->fork_policy) { + case FORK_NO_REPLICA: + mm->cg_user_replication_active = + get_mem_cgroup_from_mm(mm)->user_replication_active; + if (mm->cg_user_replication_active) + mm->fork_policy = FORK_KEEP_REPLICA; + else + mm->fork_policy = FORK_DISCARD_REPLICA; + break; + case FORK_DISCARD_REPLICA: + mm->cg_user_replication_active = 0; + break; + case FORK_KEEP_REPLICA: + BUG_ON(!mm->cg_user_replication_active); + break; + default: + BUG(); + } +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -1352,6 +1392,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); + mm_init_numa_replication_state(mm); hugetlb_count_init(mm); if (current->mm) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be1d355491449..cfee7847b1087 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -48,6 +48,7 @@ #include <linux/ratelimit.h> #include <linux/task_work.h> #include <linux/rbtree_augmented.h> +#include <linux/numa_user_replication.h> #include <asm/switch_to.h> @@ -3428,7 +3429,7 @@ static void task_numa_work(struct callback_head *work) } for (; vma; vma = vma_next(&vmi)) { - if (!vma_migratable(vma) || !vma_policy_mof(vma) || + if (!vma_migratable(vma) || (!vma_policy_mof(vma) && !vma_has_replicas(vma)) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; } @@ -3439,8 +3440,9 @@ static void task_numa_work(struct callback_head *work) * hinting faults in read-only file-backed mappings or the vdso * as migrating the pages will be of marginal benefit. */ - if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + if ((!vma->vm_mm || + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) && + !vma_has_replicas(vma)) continue; /* diff --git a/mm/Kconfig b/mm/Kconfig index 845ff9619d3ef..57a23b7379852 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1308,6 +1308,19 @@ config KERNEL_REPLICATION Page tables are replicated partially, according to replicated kernel memory range. If unsure, say "n". +config USER_REPLICATION + bool "Infrastructure for userspace replication between NUMA nodes" + default n + depends on KERNEL_REPLICATION && !TRANSPARENT_HUGEPAGE + select ARCH_USES_HIGH_VMA_FLAGS + + help + Provide interfaces for per-NUMA-node replication of some userspace mappings. + It is primarily intended for text, read-only data and read-only-after-init data.
This feature + doesn't support THP now. + + If unsure, say "n". + config IOMMU_MM_DATA bool diff --git a/mm/Makefile b/mm/Makefile index 45058cdf65d89..1dd967c39e937 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -141,6 +141,7 @@ obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_KERNEL_REPLICATION) += numa_kernel_replication.o +obj-$(CONFIG_USER_REPLICATION) += numa_user_replication.o obj-$(CONFIG_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_ETMEM) += etmem.o diff --git a/mm/gup.c b/mm/gup.c index 33e8e66b3a773..0e516de0f8b30 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -19,6 +19,7 @@ #include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> +#include <linux/numa_user_replication.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -441,7 +442,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, entry = pte_mkyoung(entry); if (!pte_same(orig_entry, entry)) { - set_pte_at(vma->vm_mm, address, pte, entry); + set_pte_at_replicated(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } } diff --git a/mm/ksm.c b/mm/ksm.c index dac2b6d5c8298..dd234a8b7f773 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -40,6 +40,7 @@ #include <linux/oom.h> #include <linux/numa.h> #include <linux/pagewalk.h> +#include <linux/numa_user_replication.h> #include <asm/tlbflush.h> #include "internal.h" @@ -1138,20 +1139,20 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * * See Documentation/mm/mmu_notifier.rst */ - entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); + entry = ptep_clear_flush_replicated(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if (page_mapcount(page) + 1 + swapped != page_count(page)) { - set_pte_at(mm, pvmw.address, pvmw.pte, entry); + set_pte_at_replicated(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. 
*/ if (anon_exclusive && folio_try_share_anon_rmap_pte(page_folio(page), page)) { - set_pte_at(mm, pvmw.address, pvmw.pte, entry); + set_pte_at_replicated(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } @@ -1162,7 +1163,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (pte_write(entry)) entry = pte_wrprotect(entry); - set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); + set_pte_at_notify_replicated(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; @@ -1263,8 +1264,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * * See Documentation/mm/mmu_notifier.rst */ - ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, newpte); + ptep_clear_flush_replicated(vma, addr, ptep); + set_pte_at_notify_replicated(mm, addr, ptep, newpte); add_reliable_page_counter(page, mm, -1); folio = page_folio(page); @@ -2392,7 +2393,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) continue; if (ksm_scan.address < vma->vm_start) ksm_scan.address = vma->vm_start; - if (!vma->anon_vma) + if (!vma->anon_vma || vma_has_replicas(vma)) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { diff --git a/mm/madvise.c b/mm/madvise.c index e51c1cf8dfca1..c4938a02804fc 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -31,6 +31,7 @@ #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> +#include <linux/numa_user_replication.h> #include <asm/tlb.h> @@ -503,6 +504,12 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; + /* + * We do not care about replicated pages here, + * they are unevictable and invisible for reclaim anyway + */ + if (folio_test_replicated(folio)) + continue; /* * If we encounter a large folio, only split it if it is not @@ -560,7 +567,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, continue; if (!pageout && pte_young(ptent)) { - clear_young_dirty_ptes(vma, addr, pte, nr, + clear_young_dirty_ptes_replicated(vma, addr, pte, nr, CYDP_CLEAR_YOUNG); tlb_remove_tlb_entries(tlb, pte, nr, addr); } @@ -724,10 +731,10 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; free_swap_and_cache_nr(entry, nr); - clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); + clear_not_present_full_ptes_replicated(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + pte_clear_not_present_full_replicated(mm, addr, pte, tlb->fullmm); } continue; } @@ -735,7 +742,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; - + if (folio_test_replicated(folio)) + continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. 
Otherwise @@ -803,7 +811,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, } if (pte_young(ptent) || pte_dirty(ptent)) { - clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); + clear_young_dirty_ptes_replicated(vma, addr, pte, nr, cydp_flags); tlb_remove_tlb_entries(tlb, pte, nr, addr); } folio_mark_lazyfree(folio); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 606a481afe2e2..786e7c9e83dd8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -68,6 +68,7 @@ #include <linux/sched/isolation.h> #include <linux/parser.h> #include <linux/dynamic_pool.h> +#include <linux/numa_user_replication.h> #ifdef CONFIG_MEMCG_SWAP_QOS #include <linux/blkdev.h> @@ -207,6 +208,9 @@ static struct move_charge_struct { struct mm_struct *mm; struct mem_cgroup *from; struct mem_cgroup *to; +#ifdef CONFIG_USER_REPLICATION + struct mem_cgroup *to_repl; +#endif unsigned long flags; unsigned long precharge; unsigned long moved_charge; @@ -4209,6 +4213,11 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, if (val & ~MOVE_MASK) return -EINVAL; +#ifdef CONFIG_USER_REPLICATION + if (memcg->user_replication_active) + return -EINVAL; +#endif + /* * No kind of locking is needed in here, because ->can_attach() will * check this value once in the beginning of the process, and then carry @@ -6125,6 +6134,46 @@ static ssize_t wb_blkio_write(struct kernfs_open_file *of, char *buf, } #endif +#ifdef CONFIG_USER_REPLICATION +static int memory_numa_replication_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + int replication_mode = READ_ONCE(memcg->user_replication_active); + + if (replication_mode) { + seq_printf(m, "enabled\n"); + } else { + seq_printf(m, "disabled\n"); + } + + return 0; +} + +static ssize_t memory_numa_replication_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct cgroup *cgrp = of_css(of)->cgroup; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + if (cgroup_has_tasks(cgrp) || !is_text_replicated()) + return -EINVAL; + + if (!strcmp(buf, "enable")) { + WRITE_ONCE(memcg->move_charge_at_immigrate, MOVE_MASK); + WRITE_ONCE(memcg->user_replication_active, true); + } else if (!strcmp(buf, "disable")) { + WRITE_ONCE(memcg->user_replication_active, false); + } else { + return -EINVAL; + } + + return nbytes; +} +#endif static struct cftype mem_cgroup_legacy_files[] = { { @@ -6363,6 +6412,14 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_dpool_2M_write, .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, }, +#endif +#ifdef CONFIG_USER_REPLICATION + { + .name = "numa_replication", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_numa_replication_show, + .write = memory_numa_replication_write, + }, #endif { }, /* terminate */ }; @@ -6616,6 +6673,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #endif #ifdef CONFIG_MEMCG_V1_RECLAIM memcg->high_async_ratio = HIGH_ASYNC_RATIO_BASE; +#endif +#ifdef CONFIG_USER_REPLICATION + memcg->user_replication_active = false; #endif page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { @@ -7386,12 +7446,29 @@ static void mem_cgroup_clear_mc(void) spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; +#ifdef CONFIG_USER_REPLICATION + mc.to_repl = NULL; +#endif mc.mm = NULL; spin_unlock(&mc.lock); mmput(mm); } +#ifdef CONFIG_USER_REPLICATION +static void mem_cgroup_clear_mc_replicated(void) +{ + struct mm_struct 
*mm = mc.mm; + + spin_lock(&mc.lock); + mc.to_repl = NULL; + mc.mm = NULL; + spin_unlock(&mc.lock); + + mmput(mm); +} +#endif + static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; @@ -7401,10 +7478,14 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) struct mm_struct *mm; unsigned long move_flags; int ret = 0; - +#ifdef CONFIG_USER_REPLICATION + /* charge immigration isn't supported on the default hierarchy */ + bool bother_charge = !cgroup_subsys_on_dfl(memory_cgrp_subsys); +#else /* charge immigration isn't supported on the default hierarchy */ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; +#endif /* * Multi-process migrations only happen on the default hierarchy @@ -7431,8 +7512,13 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) * So we need to save it, and keep it going. */ move_flags = READ_ONCE(memcg->move_charge_at_immigrate); +#ifdef CONFIG_USER_REPLICATION + if (bother_charge) + bother_charge = !!(move_flags); +#else if (!move_flags) return 0; +#endif from = mem_cgroup_from_task(p); @@ -7441,6 +7527,35 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) mm = get_task_mm(p); if (!mm) return 0; +#ifdef CONFIG_USER_REPLICATION + if (mm->owner == p) { + VM_BUG_ON(mc.to_repl); + if (bother_charge) { + VM_BUG_ON(mc.from); + VM_BUG_ON(mc.to); + VM_BUG_ON(mc.precharge); + VM_BUG_ON(mc.moved_charge); + VM_BUG_ON(mc.moved_swap); + } + + spin_lock(&mc.lock); + mc.mm = mm; + mc.to_repl = memcg; + if (bother_charge) { + mc.from = from; + mc.to = memcg; + mc.flags = move_flags; + /* We set mc.moving_task later */ + } + spin_unlock(&mc.lock); + + if (bother_charge) { + ret = mem_cgroup_precharge_mc(mm); + if (ret) + mem_cgroup_clear_mc(); + } + } +#else /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { VM_BUG_ON(mc.from); @@ -7460,7 +7575,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - } else { + } +#endif + else { mmput(mm); } return ret; @@ -7470,6 +7587,10 @@ static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) mem_cgroup_clear_mc(); +#ifdef CONFIG_USER_REPLICATION + else if (mc.to_repl) + mem_cgroup_clear_mc_replicated(); +#endif } static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, @@ -7628,9 +7749,21 @@ static void mem_cgroup_move_charge(void) static void mem_cgroup_move_task(void) { if (mc.to) { +#ifdef CONFIG_USER_REPLICATION + BUG_ON(mc.to_repl != mc.to); + numa_mm_handle_replication(mc.mm, + mc.to_repl->user_replication_active, FORK_KEEP_REPLICA); +#endif mem_cgroup_move_charge(); mem_cgroup_clear_mc(); } +#ifdef CONFIG_USER_REPLICATION + else if (mc.to_repl) { + numa_mm_handle_replication(mc.mm, + mc.to_repl->user_replication_active, FORK_KEEP_REPLICA); + mem_cgroup_clear_mc_replicated(); + } +#endif } #else /* !CONFIG_MMU */ diff --git a/mm/memory.c b/mm/memory.c index f05772babfe08..d317873a0dc46 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -79,7 +79,7 @@ #include <linux/sched/sysctl.h> #include <linux/userswap.h> #include <linux/dynamic_pool.h> -#include <linux/numa_kernel_replication.h> +#include <linux/numa_user_replication.h> #include <trace/events/kmem.h> @@ -187,142 +187,232 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member) } #ifdef CONFIG_KERNEL_REPLICATION +#ifdef CONFIG_USER_REPLICATION -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, - unsigned long addr) +static void free_pte_range(struct mmu_gather 
*tlb, pmd_t *pmd, unsigned long addr) { unsigned long offset; - int nid; + struct ptdesc *curr, *tmp; + pmd_t *curr_pmd; + pte_t *curr_pte; + pgtable_t token = pmd_pgtable(*pmd); + bool pmd_replicated = numa_pgtable_replicated(pmd); + bool pte_replicated = numa_pgtable_replicated(page_to_virt(token)); + + pmd_clear(pmd); + if (pmd_replicated) + for_each_pgtable_replica(curr, curr_pmd, pmd, offset) { + pmd_clear(curr_pmd); + } + + if (pte_replicated) { + for_each_pgtable_replica_safe(curr, tmp, curr_pte, page_to_virt(token), offset) { + cleanup_pte_list(ptdesc_page(curr)); + pte_free_tlb(tlb, ptdesc_page(curr), addr); + mm_dec_nr_ptes(tlb->mm); + } + } + cleanup_pte_list(token); + pte_free_tlb(tlb, token, addr); + mm_dec_nr_ptes(tlb->mm); +} + +static void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr) +{ + unsigned long offset; + struct ptdesc *curr, *tmp; + pud_t *curr_pud; + pmd_t *curr_pmd; + pmd_t *pmd = pmd_offset(pud, addr); + bool pud_replicated = numa_pgtable_replicated(pud); + bool pmd_replicated = numa_pgtable_replicated(pmd); + + pud_clear(pud); + if (pud_replicated) + for_each_pgtable_replica(curr, curr_pud, pud, offset) { + pud_clear(curr_pud); + } + + if (pmd_replicated) { + for_each_pgtable_replica_safe(curr, tmp, curr_pmd, pmd, offset) { + cleanup_pmd_list(curr); + pmd_free_tlb(tlb, curr_pmd, addr); + mm_dec_nr_pmds(tlb->mm); + } + } + cleanup_pmd_list(virt_to_ptdesc(pmd)); + pmd_free_tlb(tlb, pmd, addr); + mm_dec_nr_pmds(tlb->mm); +} + +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr) +{ + unsigned long offset; + struct ptdesc *curr, *tmp; + p4d_t *curr_p4d; + pud_t *curr_pud; + pud_t *pud = pud_offset(p4d, addr); + bool p4d_replicated = numa_pgtable_replicated(p4d); + bool pud_replicated = numa_pgtable_replicated(pud); + + p4d_clear(p4d); + if (p4d_replicated) + for_each_pgtable_replica(curr, curr_p4d, p4d, offset) { + p4d_clear(curr_p4d); + } + + if (pud_replicated) { + for_each_pgtable_replica_safe(curr, tmp, curr_pud, pud, offset) { + cleanup_pud_list(curr); + pud_free_tlb(tlb, curr_pud, addr); + mm_dec_nr_puds(tlb->mm); + } + } + cleanup_pud_list(virt_to_ptdesc(pud)); + pud_free_tlb(tlb, pud, addr); + mm_dec_nr_puds(tlb->mm); +} + +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr) +{ + unsigned long offset; + struct ptdesc *curr, *tmp; + pgd_t *curr_pgd; + p4d_t *curr_p4d; + p4d_t *p4d = p4d_offset(pgd, addr); + bool pgd_replicated = numa_pgtable_replicated(pgd); + bool p4d_replicated = numa_pgtable_replicated(p4d); + + pgd_clear(pgd); + if (pgd_replicated) + for_each_pgtable_replica(curr, curr_pgd, pgd, offset) { + pgd_clear(curr_pgd); + } + + if (p4d_replicated) { + for_each_pgtable_replica_safe(curr, tmp, curr_p4d, p4d, offset) { + cleanup_p4d_list(curr); + p4d_free_tlb(tlb, curr_p4d, addr); + } + } + cleanup_p4d_list(virt_to_ptdesc(p4d)); + p4d_free_tlb(tlb, p4d, addr); +} + +#else + +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) +{ + unsigned long offset; + struct ptdesc *curr; pmd_t *curr_pmd; pgtable_t token = pmd_pgtable(*pmd); + pmd_clear(pmd); if (get_propagation_level() == PMD_PROPAGATION) { - for_each_pgtable_replica(pmd, tlb->mm, curr_pmd, nid, offset) { + for_each_pgtable_replica(curr, curr_pmd, pmd, offset) { pmd_clear(curr_pmd); } - } else { - pmd_clear(pmd); } pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); - (void)token; } -static inline void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, - 
unsigned long addr) +static inline void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr) { unsigned long offset; - int nid; + struct ptdesc *curr; pud_t *curr_pud; pmd_t *pmd = pmd_offset(pud, addr); + pud_clear(pud); if (get_propagation_level() == PUD_PROPAGATION) { - for_each_pgtable_replica(pud, tlb->mm, curr_pud, nid, offset) { + for_each_pgtable_replica(curr, curr_pud, pud, offset) { pud_clear(curr_pud); } - } else { - pud_clear(pud); } pmd_free_tlb(tlb, pmd, addr); mm_dec_nr_pmds(tlb->mm); - (void)pmd; } -static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, - unsigned long addr) +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr) { unsigned long offset; - int nid; + struct ptdesc *curr; p4d_t *curr_p4d; pud_t *pud = pud_offset(p4d, addr); + p4d_clear(p4d); if (get_propagation_level() == P4D_PROPAGATION) { - for_each_pgtable_replica(p4d, tlb->mm, curr_p4d, nid, offset) { + for_each_pgtable_replica(curr, curr_p4d, p4d, offset) { p4d_clear(curr_p4d); } - } else { - p4d_clear(p4d); } pud_free_tlb(tlb, pud, addr); mm_dec_nr_puds(tlb->mm); - (void)pud; } -static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, - unsigned long addr) +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr) { unsigned long offset; - int nid; + struct ptdesc *curr; pgd_t *curr_pgd; - p4d_t *p4d = p4d_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pgd_clear(pgd); if (get_propagation_level() == PGD_PROPAGATION) { - for_each_pgtable_replica(pgd, tlb->mm, curr_pgd, nid, offset) { + for_each_pgtable_replica(curr, curr_pgd, pgd, offset) { pgd_clear(curr_pgd); } - } else { - pgd_clear(pgd); } + p4d_free_tlb(tlb, p4d, addr); - /* - * Why? If 4-level paging is enabled via kconfig, - * all functions execept p4d_offset are empty, - * and we get unused variable error - */ - (void)p4d; + (void) p4d; } -#else + +#endif /* CONFIG_USER_REPLICATION */ + +#else /* !CONFIG_KERNEL_REPLICATION */ /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
*/ -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, - unsigned long addr) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); - (void)token; } -static void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, - unsigned long addr) +static void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr) { pmd_t *pmd = pmd_offset(pud, addr); - pud_clear(pud); pmd_free_tlb(tlb, pmd, addr); mm_dec_nr_pmds(tlb->mm); - (void)pmd; } -static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, - unsigned long addr) +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr) { pud_t *pud = pud_offset(p4d, addr); - p4d_clear(p4d); pud_free_tlb(tlb, pud, addr); mm_dec_nr_puds(tlb->mm); - (void)pud; } -static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, - unsigned long addr) +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr) { p4d_t *p4d = p4d_offset(pgd, addr); - pgd_clear(pgd); p4d_free_tlb(tlb, p4d, addr); - (void)p4d; } -#endif +#endif /* CONFIG_KERNEL_REPLICATION */ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, @@ -518,9 +608,12 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, /* * Optimization: gather nearby vmas into one call down + * We are able to optimize into one call only if all of them are replicated, + * or all of them are not */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_vm_hugetlb_page(next)) { + && !is_vm_hugetlb_page(next) + && (numa_is_vma_replicant(vma) == numa_is_vma_replicant(next))) { vma = next; next = mas_find(mas, ceiling - 1); if (unlikely(xa_is_zero(next))) @@ -871,7 +964,7 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, */ WARN_ON_ONCE(1); - set_pte_at(vma->vm_mm, address, ptep, pte); + set_pte_at_replicated(vma->vm_mm, address, ptep, pte); /* * No need to invalidate - it was non-present before. However @@ -933,11 +1026,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* Mark the swap entry as shared. 
*/ if (pte_swp_exclusive(orig_pte)) { pte = pte_swp_clear_exclusive(orig_pte); - set_pte_at(src_mm, addr, src_pte, pte); + set_pte_at_replicated(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); + BUG_ON(folio_test_replicated(folio)); rss[mm_counter(folio)]++; @@ -955,11 +1049,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); - set_pte_at(src_mm, addr, src_pte, pte); + set_pte_at_replicated(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); + BUG_ON(folio_test_replicated(folio)); /* * Update rss count even for unaddressable pages, as @@ -989,7 +1084,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); - set_pte_at(src_mm, addr, src_pte, pte); + set_pte_at_replicated(src_mm, addr, src_pte, pte); } } else if (is_device_exclusive_entry(entry)) { /* @@ -1006,13 +1101,13 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) - set_pte_at(dst_mm, addr, dst_pte, + set_pte_at_replicated(dst_mm, addr, dst_pte, make_pte_marker(marker)); return 0; } if (!userfaultfd_wp(dst_vma)) pte = pte_swp_clear_uffd_wp(pte); - set_pte_at(dst_mm, addr, dst_pte, pte); + set_pte_at_replicated(dst_mm, addr, dst_pte, pte); return 0; } @@ -1060,7 +1155,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); + set_pte_at_replicated(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -1072,7 +1167,7 @@ static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, /* If it's a COW mapping, write protect it both processes. */ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { - wrprotect_ptes(src_mm, addr, src_pte, nr); + wrprotect_ptes_replicated(src_mm, addr, src_pte, nr); pte = pte_wrprotect(pte); } @@ -1084,9 +1179,31 @@ static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); - set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); + set_ptes_replicated(dst_vma->vm_mm, addr, dst_pte, pte, nr); } +#ifdef CONFIG_USER_REPLICATION +static __always_inline void __copy_present_ptes_replicated(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, + pte_t pte, pte_t *pte_numa, unsigned long addr) +{ + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_pte; + bool start; + int nid; + + BUG_ON(is_cow_mapping(src_vma->vm_flags) && pte_write(pte)); + BUG_ON(src_vma->vm_flags & VM_SHARED); + + for_each_memory_node(nid) + pte_numa[nid] = pte_mkold(pte_numa[nid]); + + for_each_pgtable(curr, curr_pte, dst_pte, nid, offset, start) + set_pte_at(dst_vma->vm_mm, addr, curr_pte, pte_numa[nid]); +} +#endif + /* * Copy one present PTE, trying to batch-process subsequent PTEs that map * consecutive pages of the same folio by copying them as well. 
@@ -1110,6 +1227,36 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma goto copy_pte; folio = page_folio(page); + BUG_ON(folio_test_replicated(folio) && folio_test_large(folio)); + +#ifdef CONFIG_USER_REPLICATION + if (folio_test_replicated(folio)) { + pte_t pte_numa[MAX_NUMNODES]; + unsigned long offset; + bool start; + struct ptdesc *curr; + pte_t *curr_pte; + int nid; + + if (!memcg_replication_enabled(dst_vma->vm_mm)) { + err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, page); + return err ? err : 1; + } + + for_each_pgtable(curr, curr_pte, src_pte, nid, offset, start) { + pte_numa[nid] = ptep_get(curr_pte); + struct page *curr_page = vm_normal_page(src_vma, addr, pte_numa[nid]); + get_page(curr_page); + rss[MM_ANONPAGES]++; + add_reliable_page_counter(curr_page, dst_vma->vm_mm, 1); + } + + __copy_present_ptes_replicated(dst_vma, src_vma, dst_pte, src_pte, + pte, pte_numa, addr); + return 1; + } +#endif /* * If we likely have to copy, just don't bother with batching. Make @@ -1224,7 +1371,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, * (whereas vma_needs_copy() skips areas without anon_vma). A rework * can remove such assumptions later, but this is good enough for now. */ - dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + dst_pte = cpr_alloc_pte_map_lock(dst_mm, addr, src_pmd, dst_pmd, &dst_ptl); if (!dst_pte) { ret = -ENOMEM; goto out; @@ -1351,7 +1498,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t *src_pmd, *dst_pmd; unsigned long next; - dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); + dst_pmd = cpr_alloc_pmd(dst_mm, addr, src_pud, dst_pud); if (!dst_pmd) return -ENOMEM; src_pmd = pmd_offset(src_pud, addr); @@ -1388,7 +1535,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pud_t *src_pud, *dst_pud; unsigned long next; - dst_pud = pud_alloc(dst_mm, dst_p4d, addr); + dst_pud = cpr_alloc_pud(dst_mm, addr, src_p4d, dst_p4d); if (!dst_pud) return -ENOMEM; src_pud = pud_offset(src_p4d, addr); @@ -1424,7 +1571,7 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, p4d_t *src_p4d, *dst_p4d; unsigned long next; - dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); + dst_p4d = cpr_alloc_p4d(dst_mm, addr, src_pgd, dst_pgd); if (!dst_p4d) return -ENOMEM; src_p4d = p4d_offset(src_pgd, addr); @@ -1603,29 +1750,52 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, - struct vm_area_struct *vma, struct folio *folio, - struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, + struct vm_area_struct *vma, struct folio *folio, struct page *page, + struct pgtable_private *ptes, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; + int nid; if (!folio_test_anon(folio)) { - ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); - if (pte_dirty(ptent)) { - folio_mark_dirty(folio); - if (tlb_delay_rmap(tlb)) { - delay_rmap = true; - *force_flush = true; + if (ptes->pte_replicated) { + for_each_memory_node(nid) { + pte = ptes->pte_numa[nid]; + ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); + if (pte_dirty(ptent)) { + folio_mark_dirty(folio); + if (tlb_delay_rmap(tlb)) { + delay_rmap = true; + *force_flush = true; + } + } + if 
(pte_young(ptent) && likely(vma_has_recency(vma))) { + folio_mark_accessed(folio); + } + } + } else { + ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); + if (pte_dirty(ptent)) { + folio_mark_dirty(folio); + if (tlb_delay_rmap(tlb)) { + delay_rmap = true; + *force_flush = true; + } } + if (pte_young(ptent) && likely(vma_has_recency(vma))) + folio_mark_accessed(folio); } - if (pte_young(ptent) && likely(vma_has_recency(vma))) - folio_mark_accessed(folio); rss[mm_counter(folio)] -= nr; } else { /* We don't need up-to-date accessed/dirty bits. */ - clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); + if (ptes->pte_replicated) { + for_each_memory_node(nid) + clear_full_ptes(mm, addr, ptes->pte_numa[nid], nr, tlb->fullmm); + } else { + clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); + } rss[MM_ANONPAGES] -= nr; } add_reliable_folio_counter(folio, mm, -nr); @@ -1650,13 +1820,47 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, } } +#ifdef CONFIG_USER_REPLICATION +static __always_inline void zap_present_folio_ptes_replicated(struct mmu_gather *tlb, + struct vm_area_struct *vma, struct pgtable_private *ptes, + pte_t *pte, pte_t ptent, unsigned long addr, + struct zap_details *details, int *rss, + bool *force_flush, bool *force_break) +{ + struct mm_struct *mm = tlb->mm; + int nid; + + for_each_memory_node(nid) { + pte = ptes->pte_numa[nid]; + ptent = ptep_get(pte); + ptes->replica_pages[nid] = vm_normal_page(vma, addr, ptent); + + /* We don't need up-to-date accesses/dirty bits. */ + clear_full_ptes(mm, addr, pte, 1, tlb->fullmm); + rss[MM_ANONPAGES]--; + add_reliable_page_counter(ptes->replica_pages[nid], mm, -1); + } + + /* Checking a single PTE in a batch is sufficient. */ + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(userfaultfd_pte_wp(vma, ptent))) + zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); + + if (unlikely(__tlb_remove_replica_pages(tlb, ptes->replica_pages))) { + *force_flush = true; + *force_break = true; + } +} +#endif + /* * Zap or skip at least one present PTE, trying to batch-process subsequent * PTEs that map consecutive pages of the same folio. * * Returns the number of processed (skipped or zapped) PTEs (at least 1). */ -static inline int zap_present_ptes(struct mmu_gather *tlb, +static inline int zap_present_ptes(struct mmu_gather *tlb, struct pgtable_private *ptes, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, @@ -1670,8 +1874,14 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, page = vm_normal_page(vma, addr, ptent); if (!page) { + int nid; /* We don't need up-to-date accessed/dirty bits. 
*/ - ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + if (ptes->pte_replicated) { + for_each_memory_node(nid) + ptep_get_and_clear_full(mm, addr, ptes->pte_numa[nid], tlb->fullmm); + } else { + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + } arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) @@ -1685,6 +1895,16 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, if (unlikely(!should_zap_folio(details, folio))) return 1; +#ifdef CONFIG_USER_REPLICATION + if (folio_test_replicated(folio)) { + BUG_ON(!ptes->pte_replicated); + + zap_present_folio_ptes_replicated(tlb, vma, ptes, pte, ptent, addr, + details, rss, force_flush, force_break); + return 1; + } +#endif + /* * Make sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. @@ -1693,12 +1913,12 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, NULL, NULL, NULL); - zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, + zap_present_folio_ptes(tlb, vma, folio, page, ptes, pte, ptent, nr, addr, details, rss, force_flush, force_break); return nr; } - zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, + zap_present_folio_ptes(tlb, vma, folio, page, ptes, pte, ptent, 1, addr, details, rss, force_flush, force_break); return 1; } @@ -1710,6 +1930,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, { bool force_flush = false, force_break = false; struct mm_struct *mm = tlb->mm; + struct pgtable_private ptes; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; @@ -1722,6 +1943,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return addr; + pgtable_update_pte(&ptes, pte); flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); @@ -1730,6 +1952,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct folio *folio; struct page *page; int max_nr; + int nid; nr = 1; if (pte_none(ptent)) @@ -1740,7 +1963,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_present(ptent)) { max_nr = (end - addr) / PAGE_SIZE; - nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, + nr = zap_present_ptes(tlb, &ptes, vma, pte, ptent, max_nr, addr, details, rss, &force_flush, &force_break); if (unlikely(force_break)) { @@ -1755,6 +1978,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, is_device_exclusive_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); + BUG_ON(folio_test_replicated(folio)); if (unlikely(!should_zap_folio(details, folio))) continue; /* @@ -1779,6 +2003,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, free_swap_and_cache_nr(entry, nr); } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); + BUG_ON(folio_test_replicated(folio)); if (!should_zap_folio(details, folio)) continue; rss[mm_counter(folio)]--; @@ -1802,9 +2027,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, /* We should have covered all the swap entry types */ WARN_ON_ONCE(1); } - clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); + + if (ptes.pte_replicated) { + for_each_memory_node(nid) + clear_not_present_full_ptes(mm, addr, ptes.pte_numa[nid], nr, tlb->fullmm); + } else { + clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); + } zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); - } while (pte += nr, 
addr += PAGE_SIZE * nr, addr != end); + } while (pgtable_pte_step(&ptes, nr), pte += nr, addr += PAGE_SIZE * nr, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -2140,7 +2371,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, folio_get(folio); inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); - set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); + set_pte_at_replicated(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; } @@ -2432,7 +2663,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, } entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, addr, pte, entry, 1)) + if (ptep_set_access_flags_replicated(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); } goto out_unlock; @@ -2449,7 +2680,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, entry = maybe_mkwrite(pte_mkdirty(entry), vma); } - set_pte_at(mm, addr, pte, entry); + set_pte_at_replicated(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ out_unlock: @@ -2647,7 +2878,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, err = -EACCES; break; } - set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); + set_pte_at_replicated(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -3185,7 +3416,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, } entry = pte_mkyoung(vmf->orig_pte); - if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) + if (ptep_set_access_flags_replicated(vma, addr, vmf->pte, entry, 0)) update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); } @@ -3363,7 +3594,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) + if (ptep_set_access_flags_replicated(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); pte_unmap_unlock(vmf->pte, vmf->ptl); count_vm_event(PGREUSE); @@ -3527,7 +3758,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * that left a window where the new PTE could be loaded into * some TLBs while the old PTE remains in others. */ - ptep_clear_flush(vma, vmf->address, vmf->pte); + ptep_clear_flush_replicated(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); /* @@ -3536,7 +3767,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * new page to be mapped directly into the secondary page table. 
*/ BUG_ON(unshare && pte_write(entry)); - set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + set_pte_at_notify_replicated(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* @@ -3792,8 +4023,10 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); - if (vmf->page) + if (vmf->page) { folio = page_folio(vmf->page); + BUG_ON(folio_test_replicated(folio)); + } /* * Shared mapping: we are guaranteed to have VM_WRITE and @@ -4051,7 +4284,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * So is_pte_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) - pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); + pte_clear_replicated(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -4378,6 +4611,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); + BUG_ON(folio_test_replicated(folio)); + /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. @@ -4481,7 +4716,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) VM_BUG_ON(!folio_test_anon(folio) || (pte_write(pte) && !PageAnonExclusive(page))); - set_ptes(vma->vm_mm, address, ptep, pte, nr_pages); + set_ptes_replicated(vma->vm_mm, address, ptep, pte, nr_pages); arch_do_swap_page_nr(vma->vm_mm, vma, address, pte, pte, nr_pages); @@ -4660,7 +4895,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && - !mm_forbids_zeropage(vma->vm_mm)) { + !mm_forbids_zeropage(vma->vm_mm) && !vma_has_replicas(vma)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, @@ -4739,7 +4974,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) setpte: if (vmf_orig_pte_uffd_wp(vmf)) entry = pte_mkuffd_wp(entry); - set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); + set_ptes_replicated(vma->vm_mm, addr, vmf->pte, entry, nr_pages); /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); @@ -4948,7 +5183,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, } else { folio_add_file_rmap_ptes(folio, page, nr, vma); } - set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); + set_ptes_replicated(vma->vm_mm, addr, vmf->pte, entry, nr); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr); @@ -5004,6 +5239,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return ret; } + BUG_ON(PageReplicated(page)); + if (pmd_none(*vmf->pmd)) { if (PageTransCompound(page)) { ret = do_set_pmd(vmf, page); @@ -5374,12 +5611,12 @@ static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_str { pte_t pte, old_pte; - old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte); + old_pte = ptep_modify_prot_start_replicated(vma, fault_addr, fault_pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (writable) pte = pte_mkwrite(pte, vma); - ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte); + ptep_modify_prot_commit_replicated(vma, fault_addr, fault_pte, old_pte, pte); update_mmu_cache_range(vmf, vma, fault_addr, 
fault_pte, 1); } @@ -5422,6 +5659,37 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru } } +#ifdef CONFIG_USER_REPLICATION +static int numa_replicate_page(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vmf->vma->vm_mm; + struct mmu_gather tlb; + unsigned long start = vmf->address & PAGE_MASK; + unsigned long end = start + PAGE_SIZE; + int ret = 1; + + /* + * This should not be possible, + * because we have just handled page fault up to pmd level, + * so pmd tables must exist and be replicated. + * In fact, even pte level tables must be replicated at this point. + */ + BUG_ON(pmd_none(*vmf->pmd) || !numa_pgtable_replicated(vmf->pmd)); + + tlb_gather_mmu(&tlb, mm); + tlb_start_vma(&tlb, vma); + + if (phys_duplicate_pte_range(&tlb, vma, vmf->pmd, start, end) != end) + ret = 0; + + tlb_end_vma(&tlb, vma); + tlb_finish_mmu(&tlb); + + return ret; +} +#endif + static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -5462,6 +5730,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (!folio || folio_is_zone_device(folio)) goto out_map; + BUG_ON(folio_test_replicated(folio)); + /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses @@ -5491,14 +5761,48 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) else last_cpupid = folio_last_cpupid(folio); target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags); - if (target_nid == NUMA_NO_NODE) { + if (target_nid == NUMA_NO_NODE && !vma_has_replicas(vma)) { folio_put(folio); goto out_map; } + + if (vma_has_replicas(vma)) + BUG_ON(!numa_pgtable_replicated(vmf->pte)); + pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; ignore_writable = true; +#ifdef CONFIG_USER_REPLICATION + if (vma_has_replicas(vma)) { + /* Drop the reference count that was elevated in numa_migrate_prep() */ + folio_put(folio); + + if (numa_replicate_page(vmf)) { + vmf->replica_action = REPLICA_NONE; + if (target_nid != NUMA_NO_NODE) + nid = target_nid; + flags |= TNF_FAULT_LOCAL; + task_numa_fault(last_cpupid, nid, 1, flags); + return 0; + } + + /* + * Checking for spurious numa-fault. + * See kernel/sched/fair.c:task_numa_work() for the same if-statement. 
+ */ + if (vma->vm_file && ((vma->vm_flags & (VM_READ|VM_WRITE)) == VM_READ)) { + /* do not task_numa_fault() in the end of function */ + nid = NUMA_NO_NODE; + goto out_unmap; + } + + if (target_nid == NUMA_NO_NODE) { + goto out_unmap; + } + } +#endif + /* Migrate to the requested node */ if (migrate_misplaced_folio(folio, vma, target_nid)) { nid = target_nid; @@ -5506,8 +5810,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) task_numa_fault(last_cpupid, nid, 1, flags); return 0; } - flags |= TNF_MIGRATE_FAIL; + +#ifdef CONFIG_USER_REPLICATION +out_unmap: +#endif vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) @@ -5680,7 +5987,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, + if (ptep_set_access_flags_replicated(vmf->vma, vmf->address, vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE)) { update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); @@ -5726,12 +6033,12 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, p4d_t *p4d; vm_fault_t ret; - pgd = pgd_offset(mm, address); - p4d = p4d_alloc(mm, pgd, address); + pgd = fault_pgd_offset(&vmf, address); + p4d = fault_p4d_alloc(&vmf, mm, pgd, address); if (!p4d) return VM_FAULT_OOM; - vmf.pud = pud_alloc(mm, p4d, address); + vmf.pud = fault_pud_alloc(&vmf, mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; retry_pud: @@ -5763,7 +6070,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, } } - vmf.pmd = pmd_alloc(mm, vmf.pud, address); + vmf.pmd = fault_pmd_alloc(&vmf, mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; @@ -5804,6 +6111,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, } } + if (fault_pte_alloc(&vmf)) + return VM_FAULT_OOM; return handle_pte_fault(&vmf); } @@ -6847,6 +7156,7 @@ void __init ptlock_cache_init(void) SLAB_PANIC, NULL); } +#ifdef CONFIG_KERNEL_REPLICATION bool ptlock_alloc(struct ptdesc *ptdesc) { spinlock_t *ptl; @@ -6855,13 +7165,33 @@ bool ptlock_alloc(struct ptdesc *ptdesc) if (!ptl) return false; ptdesc->ptl = ptl; + ptdesc->master_table = ptdesc; return true; } void ptlock_free(struct ptdesc *ptdesc) { kmem_cache_free(page_ptl_cachep, ptdesc->ptl); + ptdesc->master_table = NULL; } +#else +bool ptlock_alloc(struct ptdesc *ptdesc) +{ + spinlock_t *ptl; + + ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); + if (!ptl) + return false; + ptdesc->ptl = ptl; + return true; +} + +void ptlock_free(struct ptdesc *ptdesc) +{ + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); +} +#endif + #endif /** diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f4dfeb5f052f2..0b7f26c54b2df 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -540,6 +540,11 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, continue; if (!queue_folio_required(folio, qp)) continue; + /* + * If vma contains replicated memory, we are not going to move these pages. + */ + if (folio_test_replicated(folio)) + continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* * MPOL_MF_STRICT must be specified if we get here. 
diff --git a/mm/migrate.c b/mm/migrate.c index 05538e2edd1b7..e1d88bdbab3cf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -51,6 +51,7 @@ #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/dynamic_pool.h> +#include <linux/numa_user_replication.h> #include <asm/tlbflush.h> @@ -264,7 +265,7 @@ static bool remove_migration_pte(struct folio *folio, pvmw.address, rmap_flags); else folio_add_file_rmap_pte(folio, new, vma); - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + set_pte_at_replicated(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); @@ -2157,7 +2158,8 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, err = -EFAULT; vma = vma_lookup(mm, addr); - if (!vma || !vma_migratable(vma)) + /* if page belongs to fully replicated vma, we don't want to move it here */ + if (!vma || !vma_migratable(vma) || vma_has_replicas(vma)) goto out; /* FOLL_DUMP to ignore special (like zero) pages */ @@ -2358,7 +2360,8 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, int err = -EFAULT; vma = vma_lookup(mm, addr); - if (!vma) + /* Skip fully replicated vmas */ + if (!vma || vma_has_replicas(vma)) goto set_status; /* FOLL_DUMP to ignore special (like zero) pages */ @@ -2632,6 +2635,8 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, unsigned int nr_succeeded; LIST_HEAD(migratepages); + BUG_ON(folio_test_replicated(folio)); + /* * Don't migrate file folios that are mapped in multiple processes * with execute permissions as they are probably shared libraries. diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 58636163731af..d090ed9f73b4a 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -14,6 +14,7 @@ #include <linux/pagewalk.h> #include <linux/rmap.h> #include <linux/swapops.h> +#include <linux/numa_user_replication.h> #include <asm/tlbflush.h> #include "internal.h" @@ -200,17 +201,17 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); if (anon_exclusive) { - pte = ptep_clear_flush(vma, addr, ptep); + pte = ptep_clear_flush_replicated(vma, addr, ptep); if (folio_try_share_anon_rmap_pte(folio, page)) { - set_pte_at(mm, addr, ptep, pte); + set_pte_at_replicated(mm, addr, ptep, pte); folio_unlock(folio); folio_put(folio); mpfn = 0; goto next; } } else { - pte = ptep_get_and_clear(mm, addr, ptep); + pte = ptep_get_and_clear_replicated(mm, addr, ptep); } migrate->cpages++; @@ -247,7 +248,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, if (pte_swp_uffd_wp(pte)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } - set_pte_at(mm, addr, ptep, swp_pte); + set_pte_at_replicated(mm, addr, ptep, swp_pte); /* * This is like regular unmap: we remove the rmap and @@ -534,6 +535,8 @@ int migrate_vma_setup(struct migrate_vma *args) return -EINVAL; if (args->fault_page && !is_device_private_page(args->fault_page)) return -EINVAL; + if (vma_has_replicas(args->vma)) + return -EINVAL; memset(args->src, 0, sizeof(*args->src) * nr_pages); args->cpages = 0; @@ -664,12 +667,12 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (flush) { flush_cache_page(vma, addr, pte_pfn(orig_pte)); - ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, entry); + ptep_clear_flush_replicated(vma, addr, ptep); + set_pte_at_notify_replicated(mm, addr, ptep, entry); update_mmu_cache(vma, addr, ptep); } else { /* No need to invalidate - it was non-present before */ - 
set_pte_at(mm, addr, ptep, entry); + set_pte_at_replicated(mm, addr, ptep, entry); update_mmu_cache(vma, addr, ptep); } diff --git a/mm/mlock.c b/mm/mlock.c index cd0997d89c7c5..efbfbb38c718f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -25,6 +25,7 @@ #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/secretmem.h> +#include <linux/numa_user_replication.h> #include "internal.h" @@ -490,6 +491,13 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; +#ifdef CONFIG_USER_REPLICATION + if (!(newflags & VM_REPLICA_COMMIT) && vma_has_replicas(vma)) { + if ((ret = phys_deduplicate(vma, start, end - start, true))) + goto out; + } +#endif + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(vmi, mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), @@ -570,6 +578,18 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; + +#ifdef CONFIG_USER_REPLICATION + if (vma->vm_mm && memcg_replication_enabled(vma->vm_mm)) { + WARN_ON_ONCE(!numa_is_vma_replicant(vma)); + newflags |= VM_REPLICA_INIT; + if (vmflags_might_be_replicated(newflags)) + newflags |= VM_REPLICA_COMMIT; + else + newflags &= ~VM_REPLICA_COMMIT; + } +#endif + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) @@ -767,6 +787,17 @@ static int apply_mlockall_flags(int flags) newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; +#ifdef CONFIG_USER_REPLICATION + if (vma->vm_mm && memcg_replication_enabled(vma->vm_mm)) { + WARN_ON_ONCE(!numa_is_vma_replicant(vma)); + newflags |= VM_REPLICA_INIT; + if (vmflags_might_be_replicated(newflags)) + newflags |= VM_REPLICA_COMMIT; + else + newflags &= ~VM_REPLICA_COMMIT; + } +#endif + /* Ignore errors */ mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, newflags); diff --git a/mm/mmap.c b/mm/mmap.c index dfa3d2bfe2891..19d03b6a16591 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -48,6 +48,7 @@ #include <linux/sched/mm.h> #include <linux/ksm.h> #include <linux/share_pool.h> +#include <linux/numa_user_replication.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -1432,6 +1433,14 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon vm_flags |= VM_NORESERVE; } +#ifdef CONFIG_USER_REPLICATION + if (memcg_replication_enabled(mm) || (flags & MAP_REPLICA)) { + vm_flags |= VM_REPLICA_INIT; + if (vmflags_might_be_replicated(vm_flags)) + vm_flags |= VM_REPLICA_COMMIT; + } +#endif + addr = __mmap_region(mm, file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || @@ -1840,7 +1849,14 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, return addr; } +#ifdef CONFIG_USER_REPLICATION + if (flags & MAP_REPLICA) + info.flags = 0; + else + info.flags = VM_UNMAPPED_AREA_TOPDOWN; +#else info.flags = VM_UNMAPPED_AREA_TOPDOWN; +#endif info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); @@ -3198,6 +3214,14 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Note: This happens *after* clearing old mappings in some code paths. 
*/ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; +#ifdef CONFIG_USER_REPLICATION + if (memcg_replication_enabled(mm)) { + flags |= VM_REPLICA_INIT; + if (vmflags_might_be_replicated(flags)) + flags |= VM_REPLICA_COMMIT; + } +#endif + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; @@ -3666,6 +3690,14 @@ static struct vm_area_struct *__install_special_mapping( vma->vm_ops = ops; vma->vm_private_data = priv; +#ifdef CONFIG_USER_REPLICATION + if (memcg_replication_enabled(mm)) { + __vm_flags_mod(vma, VM_REPLICA_INIT, VM_NONE); + if (vma_might_be_replicated(vma)) + __vm_flags_mod(vma, VM_REPLICA_COMMIT, VM_NONE); + } +#endif + ret = insert_vm_struct(mm, vma); if (ret) goto out; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 99b3e9408aa0f..24cbbd8bb0a93 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -9,6 +9,7 @@ #include <linux/smp.h> #include <linux/swap.h> #include <linux/rmap.h> +#include <linux/numa_user_replication.h> #include <asm/pgalloc.h> #include <asm/tlb.h> @@ -167,6 +168,19 @@ static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, { int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; +#ifndef CONFIG_USER_REPLICATION + /* + * Make sure that we can always add another "page" + "nr_pages", + * requiring two entries instead of only a single one. + */ + const int batch_reserve = 1; +#else + /* + * And also make sure that we can always add fully replicated pages, + * requiring 'mem_nodes' entries. + */ + const int batch_reserve = max(1, num_node_state(N_MEMORY) - 1); +#endif VM_BUG_ON(!tlb->end); @@ -188,19 +202,56 @@ static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, batch->encoded_pages[batch->nr++] = encode_page(page, flags); batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages); } + + if (batch->nr >= batch->max - batch_reserve) { + if (!tlb_next_batch(tlb)) + return true; + batch = tlb->active; + } + VM_BUG_ON_PAGE(batch->nr > batch->max - batch_reserve, page); + + return false; +} + +#ifdef CONFIG_USER_REPLICATION +bool __tlb_remove_replica_pages_size(struct mmu_gather *tlb, struct page **pages, + int page_size) +{ + struct mmu_gather_batch *batch; /* * Make sure that we can always add another "page" + "nr_pages", * requiring two entries instead of only a single one. + * + * And also make sure that we can always add fully replicated pages, + * requiring 'mem_nodes' entries. */ - if (batch->nr >= batch->max - 1) { + const int batch_reserve = max(1, num_node_state(N_MEMORY) - 1); + int nid; + + VM_BUG_ON(!tlb->end); + +#ifdef CONFIG_MMU_GATHER_PAGE_SIZE + VM_WARN_ON(tlb->page_size != page_size); +#endif + + batch = tlb->active; + /* + * Add the page and check if we are full. If so + * force a flush. 
+ */ + for_each_memory_node(nid) + batch->encoded_pages[batch->nr++] = encode_page(pages[nid], 0); + + if (batch->nr >= batch->max - batch_reserve) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); + VM_BUG_ON_PAGE(batch->nr > batch->max - batch_reserve, pages[0]); return false; } +#endif bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap) diff --git a/mm/mprotect.c b/mm/mprotect.c index ed0e21a053398..947be092af22b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -32,6 +32,7 @@ #include <linux/sched/sysctl.h> #include <linux/userfaultfd_k.h> #include <linux/memory-tiers.h> +#include <linux/numa_user_replication.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -80,209 +81,234 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, return pte_dirty(pte); } -static long change_pte_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t newprot, unsigned long cp_flags) +static long change_pte_entry(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, unsigned long addr, + pgprot_t newprot, unsigned long cp_flags) { - pte_t *pte, oldpte; - spinlock_t *ptl; + pte_t oldpte = ptep_get(pte); long pages = 0; int target_node = NUMA_NO_NODE; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; - tlb_change_page_size(tlb, PAGE_SIZE); - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (!pte) - return -EAGAIN; - /* Get target node for single threaded private VMAs */ if (prot_numa && !(vma->vm_flags & VM_SHARED) && atomic_read(&vma->vm_mm->mm_users) == 1) target_node = numa_node_id(); - flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); - do { - oldpte = ptep_get(pte); - if (pte_present(oldpte)) { - pte_t ptent; + if (pte_present(oldpte)) { + pte_t ptent; + + /* + * Avoid trapping faults against the zero or KSM + * pages. See similar comment in change_huge_pmd. + */ + if (prot_numa) { + struct folio *folio; + int nid; + bool toptier; + + /* Avoid TLB flush if possible */ + if (pte_protnone(oldpte)) + return pages; + + folio = vm_normal_folio(vma, addr, oldpte); + if (!folio || folio_is_zone_device(folio) || + folio_test_ksm(folio)) + return pages; + + /* Skip fully replicated memory */ + if (folio_test_replicated(folio)) + return pages; + + /* Also skip shared copy-on-write pages */ + if (is_cow_mapping(vma->vm_flags) && + (folio_maybe_dma_pinned(folio) || + folio_likely_mapped_shared(folio))) + return pages; /* - * Avoid trapping faults against the zero or KSM - * pages. See similar comment in change_huge_pmd. + * While migration can move some dirty pages, + * it cannot move them all from MIGRATE_ASYNC + * context. 
*/ - if (prot_numa) { - struct folio *folio; - int nid; - bool toptier; - - /* Avoid TLB flush if possible */ - if (pte_protnone(oldpte)) - continue; - - folio = vm_normal_folio(vma, addr, oldpte); - if (!folio || folio_is_zone_device(folio) || - folio_test_ksm(folio)) - continue; - - /* Also skip shared copy-on-write pages */ - if (is_cow_mapping(vma->vm_flags) && - (folio_maybe_dma_pinned(folio) || - folio_likely_mapped_shared(folio))) - continue; + if (folio_is_file_lru(folio) && + folio_test_dirty(folio)) + return pages; - /* - * While migration can move some dirty pages, - * it cannot move them all from MIGRATE_ASYNC - * context. - */ - if (folio_is_file_lru(folio) && - folio_test_dirty(folio)) - continue; + /* + * Don't mess with PTEs if page is already on the node + * a single-threaded process is running on. + */ + nid = folio_nid(folio); + if (target_node == nid) + return pages; + toptier = node_is_toptier(nid); - /* - * Don't mess with PTEs if page is already on the node - * a single-threaded process is running on. - */ - nid = folio_nid(folio); - if (target_node == nid) - continue; - toptier = node_is_toptier(nid); + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + toptier) + return pages; + if (folio_use_access_time(folio)) + folio_xchg_access_time(folio, + jiffies_to_msecs(jiffies)); + } - /* - * Skip scanning top tier node if normal numa - * balancing is disabled - */ - if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - toptier) - continue; - if (folio_use_access_time(folio)) - folio_xchg_access_time(folio, - jiffies_to_msecs(jiffies)); - } + oldpte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_modify(oldpte, newprot); - oldpte = ptep_modify_prot_start(vma, addr, pte); - ptent = pte_modify(oldpte, newprot); + if (uffd_wp) + ptent = pte_mkuffd_wp(ptent); + else if (uffd_wp_resolve) + ptent = pte_clear_uffd_wp(ptent); - if (uffd_wp) - ptent = pte_mkuffd_wp(ptent); - else if (uffd_wp_resolve) - ptent = pte_clear_uffd_wp(ptent); + /* + * In some writable, shared mappings, we might want + * to catch actual write access -- see + * vma_wants_writenotify(). + * + * In all writable, private mappings, we have to + * properly handle COW. + * + * In both cases, we can sometimes still change PTEs + * writable and avoid the write-fault handler, for + * example, if a PTE is already dirty and no other + * COW or special handling is required. + */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && + !pte_write(ptent) && + can_change_pte_writable(vma, addr, ptent)) + ptent = pte_mkwrite(ptent, vma); + + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); + if (pte_needs_flush(oldpte, ptent)) + tlb_flush_pte_range(tlb, addr, PAGE_SIZE); + pages++; + } else if (is_swap_pte(oldpte)) { + swp_entry_t entry = pte_to_swp_entry(oldpte); + pte_t newpte; + + if (is_writable_migration_entry(entry)) { + struct folio *folio = pfn_swap_entry_folio(entry); /* - * In some writable, shared mappings, we might want - * to catch actual write access -- see - * vma_wants_writenotify(). - * - * In all writable, private mappings, we have to - * properly handle COW. - * - * In both cases, we can sometimes still change PTEs - * writable and avoid the write-fault handler, for - * example, if a PTE is already dirty and no other - * COW or special handling is required. 
+ * A protection check is difficult so + * just be safe and disable write */ - if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && - !pte_write(ptent) && - can_change_pte_writable(vma, addr, ptent)) - ptent = pte_mkwrite(ptent, vma); - - ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); - if (pte_needs_flush(oldpte, ptent)) - tlb_flush_pte_range(tlb, addr, PAGE_SIZE); - pages++; - } else if (is_swap_pte(oldpte)) { - swp_entry_t entry = pte_to_swp_entry(oldpte); - pte_t newpte; - - if (is_writable_migration_entry(entry)) { - struct folio *folio = pfn_swap_entry_folio(entry); - - /* - * A protection check is difficult so - * just be safe and disable write - */ - if (folio_test_anon(folio)) - entry = make_readable_exclusive_migration_entry( - swp_offset(entry)); - else - entry = make_readable_migration_entry(swp_offset(entry)); - newpte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(oldpte)) - newpte = pte_swp_mksoft_dirty(newpte); - } else if (is_writable_device_private_entry(entry)) { - /* - * We do not preserve soft-dirtiness. See - * copy_nonpresent_pte() for explanation. - */ - entry = make_readable_device_private_entry( - swp_offset(entry)); - newpte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(oldpte)) - newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_writable_device_exclusive_entry(entry)) { - entry = make_readable_device_exclusive_entry( - swp_offset(entry)); - newpte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(oldpte)) - newpte = pte_swp_mksoft_dirty(newpte); - if (pte_swp_uffd_wp(oldpte)) - newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_pte_marker_entry(entry)) { - /* - * Ignore error swap entries unconditionally, - * because any access should sigbus anyway. - */ - if (is_poisoned_swp_entry(entry)) - continue; - /* - * If this is uffd-wp pte marker and we'd like - * to unprotect it, drop it; the next page - * fault will trigger without uffd trapping. - */ - if (uffd_wp_resolve) { - pte_clear(vma->vm_mm, addr, pte); - pages++; - } - continue; - } else { - newpte = oldpte; - } - - if (uffd_wp) + if (folio_test_anon(folio)) + entry = make_readable_exclusive_migration_entry( + swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + } else if (is_writable_device_private_entry(entry)) { + /* + * We do not preserve soft-dirtiness. See + * copy_nonpresent_pte() for explanation. + */ + entry = make_readable_device_private_entry( + swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - else if (uffd_wp_resolve) - newpte = pte_swp_clear_uffd_wp(newpte); - - if (!pte_same(oldpte, newpte)) { - set_pte_at(vma->vm_mm, addr, pte, newpte); + } else if (is_writable_device_exclusive_entry(entry)) { + entry = make_readable_device_exclusive_entry( + swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_pte_marker_entry(entry)) { + /* + * Ignore error swap entries unconditionally, + * because any access should sigbus anyway. + */ + if (is_poisoned_swp_entry(entry)) + return pages; + /* + * If this is uffd-wp pte marker and we'd like + * to unprotect it, drop it; the next page + * fault will trigger without uffd trapping. 
+ */ + if (uffd_wp_resolve) { + pte_clear(vma->vm_mm, addr, pte); pages++; } + return pages; } else { - /* It must be an none page, or what else?.. */ - WARN_ON_ONCE(!pte_none(oldpte)); + newpte = oldpte; + } + + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + + if (!pte_same(oldpte, newpte)) { + set_pte_at(vma->vm_mm, addr, pte, newpte); + pages++; + } + } else { + /* It must be an none page, or what else?.. */ + WARN_ON_ONCE(!pte_none(oldpte)); + /* + * Nobody plays with any none ptes besides + * userfaultfd when applying the protections. + */ + if (likely(!uffd_wp)) + return pages; + + if (userfaultfd_wp_use_markers(vma)) { /* - * Nobody plays with any none ptes besides - * userfaultfd when applying the protections. + * For file-backed mem, we need to be able to + * wr-protect a none pte, because even if the + * pte is none, the page/swap cache could + * exist. Doing that by install a marker. */ - if (likely(!uffd_wp)) - continue; + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + pages++; + } + } - if (userfaultfd_wp_use_markers(vma)) { - /* - * For file-backed mem, we need to be able to - * wr-protect a none pte, because even if the - * pte is none, the page/swap cache could - * exist. Doing that by install a marker. - */ - set_pte_at(vma->vm_mm, addr, pte, - make_pte_marker(PTE_MARKER_UFFD_WP)); - pages++; + return pages; +} + +static long change_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) +{ + pte_t *pte; + spinlock_t *ptl; + long pages = 0; + + tlb_change_page_size(tlb, PAGE_SIZE); + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) + return -EAGAIN; + + flush_tlb_batched_pending(vma->vm_mm); + arch_enter_lazy_mmu_mode(); + do { + pages += change_pte_entry(tlb, vma, pte, addr, newprot, cp_flags); +#ifdef CONFIG_USER_REPLICATION + if (numa_pgtable_replicated(pte)) { + unsigned long offset; + struct ptdesc *curr; + pte_t *curr_pte; + for_each_pgtable_replica(curr, curr_pte, pte, offset) { + change_pte_entry(tlb, vma, curr_pte, addr, newprot, cp_flags); } } +#endif } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -492,7 +518,7 @@ static long change_protection_range(struct mmu_gather *tlb, long pages = 0, ret; BUG_ON(addr >= end); - pgd = pgd_offset(mm, addr); + pgd = pgd_offset_pgd(this_node_pgd(mm), addr); tlb_start_vma(tlb, vma); do { next = pgd_addr_end(addr, end); @@ -627,6 +653,13 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, } } +#ifdef CONFIG_USER_REPLICATION + if (!(newflags & VM_REPLICA_COMMIT) && vma_has_replicas(vma)) { + if ((error = phys_deduplicate(vma, start, end - start, true))) + goto fail; + } +#endif + /* * First try to merge with previous and/or next vma. 
*/ @@ -698,6 +731,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); + bool arch_invalid_prot; struct mmu_gather tlb; struct vma_iterator vmi; @@ -715,7 +749,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, end = start + len; if (end <= start) return -ENOMEM; - if (!arch_validate_prot(prot, start)) + arch_invalid_prot = !arch_validate_prot(prot, start); +#ifdef CONFIG_USER_REPLICATION + arch_invalid_prot = arch_invalid_prot && (prot != PROT_REPLICA); +#endif + if (arch_invalid_prot) return -EINVAL; reqprot = prot; @@ -737,6 +775,22 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (!vma) goto out; +#ifdef CONFIG_USER_REPLICATION + if (prot == PROT_REPLICA) { + error = -EINVAL; + if (!vma_might_be_replicated(vma)) + goto out; + + if (!vma_has_replicas(vma)) + vm_flags_set(vma, VM_REPLICA_COMMIT); + error = phys_duplicate(vma, start, len); + if (error) + pr_info("Failed to replicate memory -- start:%zx; len:%zx PID: %d NAME: %s\n", + start, len, current->pid, current->comm); + goto out; + } +#endif + if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; @@ -787,6 +841,17 @@ static int do_mprotect_pkey(unsigned long start, size_t len, newflags = calc_vm_prot_bits(prot, new_vma_pkey); newflags |= (vma->vm_flags & ~mask_off_old_flags); +#ifdef CONFIG_USER_REPLICATION + if (vma->vm_mm && memcg_replication_enabled(vma->vm_mm)) { + WARN_ON_ONCE(!numa_is_vma_replicant(vma)); + newflags |= VM_REPLICA_INIT; + if (vmflags_might_be_replicated(newflags)) + newflags |= VM_REPLICA_COMMIT; + else + newflags &= ~VM_REPLICA_COMMIT; + } +#endif + /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) { error = -EACCES; diff --git a/mm/mremap.c b/mm/mremap.c index e990bb8c89181..f6f716106f560 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -27,6 +27,7 @@ #include <linux/mempolicy.h> #include <linux/share_pool.h> #include <linux/userswap.h> +#include <linux/numa_user_replication.h> #include <asm/cacheflush.h> #include <asm/tlb.h> @@ -193,7 +194,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (pte_none(ptep_get(old_pte))) continue; - pte = ptep_get_and_clear(mm, old_addr, old_pte); + pte = ptep_get_and_clear_replicated(mm, old_addr, old_pte); /* * If we are remapping a valid PTE, make sure * to flush TLB before we drop the PTL for the @@ -209,7 +210,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, force_flush = true; pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); pte = move_soft_dirty_pte(pte); - set_pte_at(mm, new_addr, new_pte, pte); + set_pte_at_replicated(mm, new_addr, new_pte, pte); } arch_leave_lazy_mmu_mode(); @@ -242,6 +243,11 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, struct mm_struct *mm = vma->vm_mm; bool res = false; pmd_t pmd; +#ifdef CONFIG_USER_REPLICATION + pmd_t pmd_numa[MAX_NUMNODES]; + bool old_pte_replicated = numa_pgtable_replicated(page_to_virt(pmd_pgtable(*old_pmd))); + bool new_pmd_replicated = numa_pgtable_replicated(new_pmd); +#endif if (!arch_supports_page_table_move()) return false; @@ -271,6 +277,15 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (WARN_ON_ONCE(!pmd_none(*new_pmd))) return false; +#ifdef CONFIG_USER_REPLICATION + /* + * In that case, we need to somehow get rid of page tables 
replicas of pte level + * I am not sure how to do it properly right now, so fallback to slowpath + */ + if (old_pte_replicated && !new_pmd_replicated) + return false; +#endif + /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. @@ -280,18 +295,43 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); +#ifdef CONFIG_USER_REPLICATION + if (old_pte_replicated) { + int nid; + unsigned long offset; + struct ptdesc *curr; + pmd_t *curr_pmd; + bool start; + + for_each_pgtable(curr, curr_pmd, old_pmd, nid, offset, start) + pmd_numa[nid] = *(curr_pmd); + } +#endif pmd = *old_pmd; /* Racing with collapse? */ if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd))) goto out_unlock; /* Clear the pmd */ - pmd_clear(old_pmd); + pmd_clear_replicated(old_pmd); res = true; VM_BUG_ON(!pmd_none(*new_pmd)); - pmd_populate(mm, new_pmd, pmd_pgtable(pmd)); +#ifdef CONFIG_USER_REPLICATION + if (new_pmd_replicated && old_pte_replicated) { + int nid; + unsigned long offset; + struct ptdesc *curr; + pmd_t *curr_pmd; + bool start; + + for_each_pgtable(curr, curr_pmd, new_pmd, nid, offset, start) + pmd_populate(mm, curr_pmd, pmd_pgtable(pmd_numa[nid])); + } else +#endif + pmd_populate_replicated(mm, new_pmd, pmd_pgtable(pmd)); + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); out_unlock: if (new_ptl != old_ptl) @@ -316,6 +356,11 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, spinlock_t *old_ptl, *new_ptl; struct mm_struct *mm = vma->vm_mm; pud_t pud; +#ifdef CONFIG_USER_REPLICATION + pud_t pud_numa[MAX_NUMNODES]; + bool old_pmd_replicated = numa_pgtable_replicated(pud_pgtable(*old_pud)); + bool new_pud_replicated = numa_pgtable_replicated(new_pud); +#endif if (!arch_supports_page_table_move()) return false; @@ -326,6 +371,15 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, if (WARN_ON_ONCE(!pud_none(*new_pud))) return false; +#ifdef CONFIG_USER_REPLICATION + /* + * In that case, we need to somehow get rid of page tables replicas of pmd level + * I am not sure how to do it properly right now, so fallback to slowpath + */ + if (old_pmd_replicated && !new_pud_replicated) + return false; +#endif + /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. 
@@ -336,12 +390,37 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); /* Clear the pud */ +#ifdef CONFIG_USER_REPLICATION + if (old_pmd_replicated) { + int nid; + unsigned long offset; + struct ptdesc *curr; + pud_t *curr_pud; + bool start; + + for_each_pgtable(curr, curr_pud, old_pud, nid, offset, start) + pud_numa[nid] = *(curr_pud); + } +#endif pud = *old_pud; - pud_clear(old_pud); + pud_clear_replicated(old_pud); VM_BUG_ON(!pud_none(*new_pud)); - pud_populate(mm, new_pud, pud_pgtable(pud)); +#ifdef CONFIG_USER_REPLICATION + if (new_pud_replicated && old_pmd_replicated) { + int nid; + unsigned long offset; + struct ptdesc *curr; + pud_t *curr_pud; + bool start; + + for_each_pgtable(curr, curr_pud, new_pud, nid, offset, start) + pud_populate(mm, curr_pud, pud_pgtable(pud_numa[nid])); + } else +#endif + pud_populate_replicated(mm, new_pud, pud_pgtable(pud)); + flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); @@ -776,6 +855,12 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EINVAL); } + /* + * For simplicity, remap is not supported for has_replicas vmas right now + */ + if(vma_has_replicas(vma)) + return ERR_PTR(-EINVAL); + if ((flags & MREMAP_DONTUNMAP) && (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) return ERR_PTR(-EINVAL); diff --git a/mm/numa_kernel_replication.c b/mm/numa_kernel_replication.c index 17664a1300b87..555be093bd794 100644 --- a/mm/numa_kernel_replication.c +++ b/mm/numa_kernel_replication.c @@ -625,6 +625,8 @@ static void replicate_pgtables(void) init_mm.pgd_numa[nid] = node_desc[memory_nid].pgd; } + + init_mm.pgd = init_mm.pgd_numa[first_memory_node]; } static void __init numa_replicate_kernel_text_disabled(void) @@ -742,7 +744,7 @@ static int __init setup_kernel_replication(char *str) __setup("kernel_replication=", setup_kernel_replication); -nodemask_t __ro_after_init replica_nodes = { { [0] = 1UL } }; +extern nodemask_t replica_nodes; /* * Let us pretend, that we have only single node fore replicas. 
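Note on the userspace flow (illustrative only, not part of the patch): the do_mprotect_pkey() change above accepts a new PROT_REPLICA request, which commits the VMA (VM_REPLICA_COMMIT) and calls phys_duplicate() on the range, while MAP_REPLICA at mmap() time marks the mapping for replication and implies populating it up front. The sketch below shows how an application might drive that path. It assumes a sysroot built against the patched uapi mman headers for PROT_REPLICA (the placeholder value below is hypothetical), and it assumes the range must be made read-only before replication (vma_might_be_replicated() is not visible in these hunks); treat it as a sketch of the intended usage, not a definitive ABI description.

    #include <sys/mman.h>
    #include <stdio.h>
    #include <string.h>

    #ifndef PROT_REPLICA
    /* Placeholder only: the real value is defined by the patched uapi headers. */
    #define PROT_REPLICA 0x40
    #endif

    int main(void)
    {
            size_t len = 2UL << 20;

            /* Populate a private anonymous region with read-mostly data. */
            char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
            if (buf == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            memset(buf, 0x5a, len);

            /* Assumed precondition: a replicated range must not stay writable. */
            if (mprotect(buf, len, PROT_READ)) {
                    perror("mprotect(PROT_READ)");
                    return 1;
            }

            /*
             * PROT_REPLICA is passed alone (do_mprotect_pkey() compares
             * prot == PROT_REPLICA) and triggers phys_duplicate(), creating
             * a per-NUMA-node physical copy behind replicated page tables.
             */
            if (mprotect(buf, len, PROT_REPLICA)) {
                    perror("mprotect(PROT_REPLICA)");
                    return 1;
            }

            /* Readers on any node should now hit their node-local replica. */
            return 0;
    }
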
diff --git a/mm/numa_user_replication.c b/mm/numa_user_replication.c new file mode 100644 index 0000000000000..d7540c1ae3e50 --- /dev/null +++ b/mm/numa_user_replication.c @@ -0,0 +1,1577 @@ +#include <linux/numa_user_replication.h> +#include <asm/tlb.h> + +#include "internal.h" + +static int pick_remaining_node(struct page *page, struct vm_area_struct *vma, + unsigned long addr) { + return mpol_misplaced(page_folio(page), vma, addr); +} + +static int phys_deduplicate_pte_entry(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + bool alloc_new_page, struct page **new_page) +{ + struct page *orig_page; + pte_t *pte, entry, orig_entry; + spinlock_t *ptl; + struct pgtable_private ptes; + int nid, orig_nid; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + entry = ptep_get(pte); + + if (!pte_present(entry)) { + pte_unmap_unlock(pte, ptl); + return 0; + } else { + struct page *page = vm_normal_page(vma, addr, entry); + BUG_ON(page && folio_test_large(page_folio(page))); + + if (!page || !PageReplicated(page)) { + pte_unmap_unlock(pte, ptl); + return 0; + } + } + + pgtable_update_pte(&ptes, pte); + for_each_memory_node(nid) { + ptes.replica_pages[nid] = vm_normal_page(vma, addr, ptep_get(ptes.pte_numa[nid])); + } + + if (alloc_new_page) { + orig_nid = NUMA_NO_NODE; + orig_page = *new_page; + *new_page = NULL; + } else { + orig_nid = pick_remaining_node(ptes.replica_pages[first_memory_node], vma, addr); + if (orig_nid == NUMA_NO_NODE) + orig_nid = first_memory_node; + + orig_page = ptes.replica_pages[orig_nid]; + } + + orig_entry = mk_pte(orig_page, vma->vm_page_prot); + + if (alloc_new_page) { + void* src_vaddr = page_to_virt(ptes.replica_pages[first_memory_node]); + void* new_vaddr = page_to_virt(orig_page); + copy_page(new_vaddr, src_vaddr); + + __SetPageUptodate(orig_page); + + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + add_reliable_page_counter(orig_page, vma->vm_mm, 1); + } else { + ClearPageReplicated(orig_page); + } + + folio_add_new_anon_rmap(page_folio(orig_page), vma, addr, RMAP_EXCLUSIVE); + folio_add_lru_vma(page_folio(orig_page), vma); + + for_each_memory_node(nid) { + if (nid == orig_nid) + continue; + + set_pte_at(vma->vm_mm, addr, ptes.pte_numa[nid], orig_entry); + tlb_remove_tlb_entry(tlb, ptes.pte_numa[nid], addr); + } + + pte_unmap_unlock(pte, ptl); + + for_each_memory_node(nid) { + if (nid == orig_nid) + continue; + + dec_mm_counter(vma->vm_mm, MM_ANONPAGES); + add_reliable_page_counter(ptes.replica_pages[nid], vma->vm_mm, -1); + + tlb_remove_page(tlb, ptes.replica_pages[nid]); + } + + return 0; +} + +static int prealloc_page_for_deduplication(struct vm_area_struct *vma, + unsigned long addr, bool alloc_new_page, struct page **page) +{ + struct folio *folio; + + if (!alloc_new_page || *page) + return 0; + + folio = vma_alloc_zeroed_movable_folio(vma, addr); + if (!folio) + return -ENOMEM; + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return -ENOMEM; + } + folio_throttle_swaprate(folio, GFP_KERNEL); + + *page = folio_page(folio, 0); + return 0; +} + +static int phys_deduplicate_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + unsigned long end, bool alloc_new_page) +{ + struct page *prealloc_page = NULL; + int error = 0; + + tlb_change_page_size(tlb, PAGE_SIZE); + + flush_tlb_batched_pending(vma->vm_mm); + arch_enter_lazy_mmu_mode(); + do { + error = prealloc_page_for_deduplication(vma, addr, alloc_new_page, &prealloc_page); + if (error) + break; 
+ error = phys_deduplicate_pte_entry(tlb, vma, pmd, addr, alloc_new_page, &prealloc_page); + if (error) + break; + } while (addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + + if (prealloc_page) + put_page(prealloc_page); + + return error; +} + +static int phys_deduplicate_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, unsigned long addr, + unsigned long end, bool alloc_new_page) +{ + pmd_t *pmd; + unsigned long next; + int error = 0; + + pmd = pmd_offset(pud, addr); + do { + pmd_t _pmd; + + next = pmd_addr_end(addr, end); + + if (pmd_none(*pmd)) + goto next; + + _pmd = pmdp_get_lockless(pmd); + if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) + BUG(); // not supported right now, probably only trans_huge will be + + error = phys_deduplicate_pte_range(tlb, vma, pmd, addr, next, alloc_new_page); + if (error) + break; +next: + cond_resched(); + } while (pmd++, addr = next, addr != end); + + return error; +} + +static int phys_deduplicate_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, + unsigned long end, bool alloc_new_page) +{ + pud_t *pud; + unsigned long next; + int error = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + error = phys_deduplicate_pmd_range(tlb, vma, pud, addr, next, alloc_new_page); + if (error) + break; + } while (pud++, addr = next, addr != end); + + return error; +} + +static int phys_deduplicate_p4d_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, + unsigned long end, bool alloc_new_page) +{ + p4d_t *p4d; + unsigned long next; + int error = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + error = phys_deduplicate_pud_range(tlb, vma, p4d, addr, next, alloc_new_page); + if (error) + break; + } while (p4d++, addr = next, addr != end); + + return error; +} + +/* + * Pages inside [addr; end) are 100% populated, + * so we can't skip some checks and simplify code. 
+ */ +static int phys_deduplicate_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, + unsigned long end, bool alloc_new_page) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; + int error = 0; + + if (addr == end) + return 0; + + BUG_ON(addr >= end); + pgd = pgd_offset_pgd(this_node_pgd(mm), addr); + tlb_start_vma(tlb, vma); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + error = phys_deduplicate_p4d_range(tlb, vma, pgd, addr, next, alloc_new_page); + if (error) + break; + } while (pgd++, addr = next, addr != end); + + tlb_end_vma(tlb, vma); + + return error; +} + +static int numa_remove_replicas(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, bool alloc_new_page) +{ + int error; + + start = start & PAGE_MASK; + end = end & PAGE_MASK; + + error = phys_deduplicate_range(tlb, vma, start, end, alloc_new_page); +// if (!error && printk_ratelimit()) { +// pr_info("Deduplicated range: 0x%016lx --- 0x%016lx, mm: 0x%016lx, PID: %d name: %s\n", +// start, end, (unsigned long)(vma->vm_mm), vma->vm_mm->owner->pid, vma->vm_mm->owner->comm); +// } + BUG_ON(error && !alloc_new_page); + + return error; +} + +/* + * We must hold at least mmap_read_lock or vma_read_lock + */ +int phys_deduplicate(struct vm_area_struct *vma, unsigned long start, size_t len, + bool alloc_new_page) +{ + int error = 0; + struct mmu_gather tlb; + + if (!vma) { + pr_warn("%s -- %s:%d\n", __func__, __FILE__, __LINE__); + return -EINVAL; + } + + BUG_ON(vma_has_replicas(vma) && !vma_might_be_replicated(vma)); + if (!vma_has_replicas(vma)) { + pr_warn("%s -- %s:%d\n", __func__, __FILE__, __LINE__); + return -EINVAL; + } + + if ((start < vma->vm_start) || (start + len > vma->vm_end)) { + pr_warn("Deduplication is possible only inside vma\n"); + pr_warn("vma->vm_start %zx; len %zx\n", vma->vm_start, + vma->vm_end - vma->vm_start); + return -EINVAL; + } + + tlb_gather_mmu(&tlb, vma->vm_mm); + error = numa_remove_replicas(&tlb, vma, vma->vm_start, vma->vm_end, alloc_new_page); + tlb_finish_mmu(&tlb); + + return error; +} + +static int __fixup_fault(struct vm_area_struct *vma, unsigned long addr) { + return (handle_mm_fault(vma, addr, FAULT_FLAG_INTERRUPTIBLE | FAULT_FLAG_KILLABLE | + FAULT_FLAG_RETRY_NOWAIT | FAULT_FLAG_ALLOW_RETRY, NULL) & VM_FAULT_ERROR); +} + +static int fixup_fault(struct vm_area_struct *vma, unsigned long addr) { + vm_fault_t fault = __fixup_fault(vma, addr); + if (fault & VM_FAULT_SIGBUS) + return 0; + return !!(fault & VM_FAULT_ERROR); +} + +static int phys_duplicate_pte_entry(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + struct page **replica_pages) +{ + struct page *orig_page; + pte_t *pte, *orig_pte, orig_entry; + spinlock_t *ptl; + struct pgtable_private ptes; + int nid; + int reason = 0; +retry: + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + + if (!numa_pgtable_replicated(pte)) { + vm_fault_t fault; + pte_unmap_unlock(pte, ptl); + mmap_assert_locked(vma->vm_mm); + fault = __fixup_fault(vma, addr); + + if (fault & (VM_FAULT_SIGBUS | VM_FAULT_RETRY)) + return -EBUSY; + if(fault) + return -ENOMEM; + goto retry; + } + + pgtable_update_pte(&ptes, pte); + /* It could happen on a not yet faulted vaddr. Now we require from user to + * put MAP_POPULATE manually, but add it with MAP_REPLICA silently. 
+ */ + orig_pte = ptes.pte_numa[first_memory_node]; + orig_entry = ptep_get(orig_pte); + + for_each_memory_node(nid) { + /* + * For some unknown reasons, there are cases, when + * pte_level populated only on single node. This is not good, + * to avoid this check all ptes now, but this should not happening at all + */ + if(!pte_present(ptep_get(ptes.pte_numa[nid]))) { + pte_unmap_unlock(pte, ptl); + return -EBUSY; + } + } + if (pte_write(orig_entry)) { + reason = 2; + goto bug; + } + + /* We can handle this case only for 0th node table (I hope so), + * because we are under pte_lock, which serializes migration pte modifications + */ + orig_page = vm_normal_page(vma, addr, orig_entry); + BUG_ON(orig_page && folio_test_large(page_folio(orig_page))); + + if (orig_page && PageReplicated(orig_page)) { + pte_unmap_unlock(pte, ptl); + return -EBUSY; + } + + for_each_memory_node(nid) { + struct page *src_page, *new_page; + void *src_vaddr, *new_vaddr; + pte_t *curr_pte, curr_entry, new_entry; + + curr_pte = ptes.pte_numa[nid]; + curr_entry = ptep_get(curr_pte); + src_page = pte_page(curr_entry); + + new_page = replica_pages[nid]; + replica_pages[nid] = NULL; + + new_vaddr = page_to_virt(new_page); + src_vaddr = page_to_virt(src_page); + copy_page(new_vaddr, src_vaddr); + + __SetPageUptodate(new_page); + + new_entry = mk_pte(new_page, vma->vm_page_prot); + + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + add_reliable_page_counter(new_page, vma->vm_mm, 1); + + set_pte_at(vma->vm_mm, addr, curr_pte, new_entry); + if (pte_needs_flush(curr_entry, new_entry)) + tlb_flush_pte_range(tlb, addr, PAGE_SIZE); + } + + if (orig_page) { + dec_mm_counter(vma->vm_mm, mm_counter(page_folio(orig_page))); + add_reliable_page_counter(orig_page, vma->vm_mm, -1); + folio_remove_rmap_pte(page_folio(orig_page), orig_page, vma); + } + + pte_unmap_unlock(pte, ptl); + + if (orig_page) { + free_pages_and_swap_cache((struct encoded_page **) &orig_page, 1); + } + + return 0; + +bug: + dump_mm_pgtables(vma->vm_mm, addr, addr + PAGE_SIZE * 4 - 1); + pr_info("Died because BUG_ON #%d\n", reason); + BUG(); +} + +static void release_prealloc_pages(struct page **pages) +{ + int nid; + for_each_memory_node(nid) { + if (pages[nid] != NULL) { + put_page(pages[nid]); + pages[nid] = NULL; + } + } +} + +static int prealloc_pages_for_replicas(struct mm_struct *mm, struct page **pages) +{ + int nid; + for_each_memory_node(nid) { + /* + * Do not reclaim in case of memory shortage, just fail + * We already don't have enough memory. 
+ * Also, make replica pages unmovable + */ + pages[nid] = alloc_pages_node(nid, + (GFP_HIGHUSER | __GFP_THISNODE) & (~__GFP_DIRECT_RECLAIM), 0); + if (pages[nid] == NULL) + goto fail; + SetPageReplicated(pages[nid]); + if (mem_cgroup_charge(page_folio(pages[nid]), mm, GFP_KERNEL)) + goto fail; + } + + for_each_memory_node(nid) { + folio_throttle_swaprate(page_folio(pages[nid]), GFP_KERNEL); + } + + return 0; + +fail: + release_prealloc_pages(pages); + return - ENOMEM; +} + +/* + * We must hold at least mmap_read_lock or vma_read_lock + */ +unsigned long phys_duplicate_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + struct page *prealloc_pages[MAX_NUMNODES] = {}; + + tlb_change_page_size(tlb, PAGE_SIZE); + + flush_tlb_batched_pending(vma->vm_mm); + arch_enter_lazy_mmu_mode(); + do { + int ret = 0; + if (prealloc_pages_for_replicas(vma->vm_mm, prealloc_pages)) + break; + ret = phys_duplicate_pte_entry(tlb, vma, pmd, addr, prealloc_pages); + if (ret) + release_prealloc_pages(prealloc_pages); + if (ret == -ENOMEM) + break; + } while (addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + + return addr; +} + +static unsigned long phys_duplicate_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, unsigned long addr, + unsigned long end) +{ + pmd_t *pmd; + unsigned long next; +retry: + pmd = pmd_offset(pud, addr); + do { + pmd_t _pmd; + + next = pmd_addr_end(addr, end); + + if (pmd_none(*pmd) || !numa_pgtable_replicated(pmd)) { + if (fixup_fault(vma, addr)) + break; + goto retry; + } + + _pmd = pmdp_get_lockless(pmd); + if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { + BUG(); // not supported right now, probably only trans_huge will be + } + + addr = phys_duplicate_pte_range(tlb, vma, pmd, addr, next); + if (addr != next) + break; + cond_resched(); + } while (pmd++, addr != end); + + return addr; +} + +static unsigned long phys_duplicate_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, + unsigned long end) +{ + pud_t *pud; + unsigned long next; +retry: + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if(pud_none_or_clear_bad(pud) || !numa_pgtable_replicated(pud)) { + if (fixup_fault(vma, addr)) + break; + goto retry; + } + addr = phys_duplicate_pmd_range(tlb, vma, pud, addr, next); + if (addr != next) + break; + } while (pud++, addr != end); + + return addr; +} + +static unsigned long phys_duplicate_p4d_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + p4d_t *p4d; + unsigned long next; +retry: + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d) || !numa_pgtable_replicated(p4d)) { + if (fixup_fault(vma, addr)) + break; + goto retry; + } + addr = phys_duplicate_pud_range(tlb, vma, p4d, addr, next); + if (addr != next) + break; + } while (p4d++, addr != end); + + return addr; +} + +static unsigned long phys_duplicate_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, + unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; + + mmap_assert_locked(mm); + if (unlikely(anon_vma_prepare(vma))) + return addr; + + BUG_ON(addr >= end); + pgd = pgd_offset_pgd(this_node_pgd(mm), addr); + tlb_start_vma(tlb, vma); +retry: + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + if (fixup_fault(vma, addr)) 
+ break; + goto retry; + } + addr = phys_duplicate_p4d_range(tlb, vma, pgd, addr, next); + if (addr != next) + break; + } while (pgd++, addr != end); + + tlb_end_vma(tlb, vma); + + return addr; + +} + +static int numa_clone_pte(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + unsigned long last = 0; + + start = start & PAGE_MASK; + end = end & PAGE_MASK; + + last = phys_duplicate_range(tlb, vma, start, end); + if (last != end) { + phys_deduplicate_range(tlb, vma, start, last, false); + return -ENOMEM; + } + + return 0; +} + +/* + * We must hold at least mmap_read_lock but not vma_read_lock + */ +int phys_duplicate(struct vm_area_struct *vma, unsigned long start, size_t len) +{ + int error = 0; + struct mmu_gather tlb; + + if (!vma) { + pr_warn("%s -- %s:%d\n", __func__, __FILE__, __LINE__); + return -EINVAL; + } + + BUG_ON(vma_has_replicas(vma) && !vma_might_be_replicated(vma)); + if (!vma_has_replicas(vma)) { + pr_warn("%s -- %s:%d\n", __func__, __FILE__, __LINE__); + return -EINVAL; + } + + if ((start < vma->vm_start) || (start + len > vma->vm_end)) { + pr_warn("Replication is possible only inside vma\n"); + pr_warn("vma->vm_start %zx; len %zx\n", vma->vm_start, + vma->vm_end - vma->vm_start); + return -EINVAL; + } + + tlb_gather_mmu(&tlb, vma->vm_mm); + error = numa_clone_pte(&tlb, vma, start, start + len); + tlb_finish_mmu(&tlb); + + return error; +} + +void numa_mm_handle_replication(struct mm_struct *mm, bool enable, fork_policy_t fork_policy) +{ + struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, 0, 0); + + mmap_write_lock(mm); + + switch (fork_policy) { + case FORK_DISCARD_REPLICA: + if (enable) + mm->cg_user_replication_active = true; + else + BUG_ON(mm->cg_user_replication_active); + break; + case FORK_KEEP_REPLICA: + if (enable) { + mm->cg_user_replication_active = true; + mm->fork_policy = FORK_KEEP_REPLICA; + } else { + mm->fork_policy = FORK_DISCARD_REPLICA; + } + break; + case FORK_NO_REPLICA: + default: + BUG(); + } + + if (!enable) + goto out; + + mas_for_each(&mas, vma, ULONG_MAX) { + vm_flags_set(vma, VM_REPLICA_INIT); + if (vma_might_be_replicated(vma)) { + vm_flags_set(vma, VM_REPLICA_COMMIT); + phys_duplicate(vma, vma->vm_start, vma->vm_end - vma->vm_start); + } + } + +out: + mmap_write_unlock(mm); +} + +static inline bool replicated_p4d_level(struct vm_fault *vmf) +{ + //TODO Do something better + return mm_p4d_folded(vmf->vma->vm_mm) || vmf->p4d_replicated || pgd_none(*(vmf->pgd)); +} + +static inline bool replicated_pud_level(struct vm_fault *vmf) +{ + //TODO Do something better + /* We don't have entries on this level, or they are not the same*/ + return mm_pud_folded(vmf->vma->vm_mm) || vmf->pud_replicated || p4d_none(*(vmf->p4d)); +} + +static inline bool replicated_pmd_level(struct vm_fault *vmf) +{ + //TODO Do something better + /* We don't have entries on this level, or they are not the same*/ + return mm_pmd_folded(vmf->vma->vm_mm) || vmf->pmd_replicated || pud_none(*(vmf->pud)); +} + +static inline bool replicated_pte_level(struct vm_fault *vmf) +{ + //TODO Do something better + /* We don't have entries on this level, or they are not the same*/ + return vmf->pte_replicated || pmd_none(*(vmf->pmd)); +} + +static inline bool overlap_pmd_entry(unsigned long address, unsigned long left, unsigned long right) +{ + return ((address & PMD_MASK) == (left & PMD_MASK)) || + ((address & PMD_MASK) == (right & PMD_MASK)); +} + +static inline bool overlap_pud_entry(unsigned long address, unsigned long left, 
unsigned long right) +{ + return ((address & PUD_MASK) == (left & PUD_MASK)) || + ((address & PUD_MASK) == (right & PUD_MASK)); +} + +static inline bool overlap_p4d_entry(unsigned long address, unsigned long left, unsigned long right) +{ + return ((address & P4D_MASK) == (left & P4D_MASK)) || + ((address & P4D_MASK) == (right & P4D_MASK)); +} + +static inline bool overlap_pgd_entry(unsigned long address, unsigned long left, unsigned long right) +{ + return ((address & PGDIR_MASK) == (left & PGDIR_MASK)) || + ((address & PGDIR_MASK) == (right & PGDIR_MASK)); +} + +static inline void get_replicant_neighbours(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned long *left, unsigned long *right) +{ + *left = ULONG_MAX; + *right = ULONG_MAX; + + if (numa_is_vma_replicant(vma)) + *left = *right = address; +} + +static inline void __replication_path_action(struct vm_fault *vmf, bool replicated) +{ + if (vmf->replica_action != REPLICA_NONE) { + /* + * If we meet propagation action again, that means upper + * level has already been propagated and we don't have + * replicas anylower -- we need to completely switch + * to default handling. + */ + if (vmf->replica_action == REPLICA_PROPAGATE) + vmf->replica_action = REPLICA_NONE; + else + vmf->replica_action = replicated ? REPLICA_KEEP : REPLICA_PROPAGATE; + } +} + +static bool replication_path_pgd(struct vm_fault *vmf) +{ + bool p4d_folded = mm_p4d_folded(vmf->vma->vm_mm), replicated; + struct mm_struct *mm = vmf->vma->vm_mm; + unsigned long address = vmf->real_address; + /* There are replicated tables in our pgd entry or there is vma requiring it. Need to replicate next level. + * 5-level paging and folded p4d give us a lot of grief. + * If 5-level paging disabled, handle_mm_fault_pgd function doing nothing, except filling vmf->p4d_numa + * with same values as in vmf->pgd_numa and propagation will not work correctly. + * So we need to go in __handle_mm_fault_p4d_replicant, because we might still want to propagate it. + */ + get_replicant_neighbours(mm, vmf->vma, address, &(vmf->left_replicant), &(vmf->right_replicant)); + if (!p4d_folded) + vmf->p4d_replicated = !pgd_none(*(vmf->pgd)) && + PageReplicated(virt_to_page(pgd_page_vaddr(*vmf->pgd))); + + replicated = p4d_folded || overlap_pgd_entry(address, vmf->left_replicant, vmf->right_replicant) + || vmf->p4d_replicated; + /* + * Here replica_action may be REPLICA_NONE, so we ignore that, + * because we always replicate top level table. + */ + + vmf->replica_action = replicated ? REPLICA_KEEP : REPLICA_PROPAGATE; + return replicated; +} + +static bool replication_path_p4d(struct vm_fault *vmf) +{ + bool pud_folded = mm_pud_folded(vmf->vma->vm_mm), replicated; + unsigned long address = vmf->real_address; + + if (vmf->replica_action == REPLICA_PROPAGATE) { + /* + * We have already propagated upper level, + * so we'll never use XXX_replicated values again + * during this fault. 
+ */ + vmf->replica_action = REPLICA_NONE; + return false; + } + + if (!pud_folded) + vmf->pud_replicated = !p4d_none(*(vmf->p4d)) && + PageReplicated(virt_to_page(p4d_pgtable(*vmf->p4d))); + + replicated = pud_folded || overlap_p4d_entry(address, vmf->left_replicant, vmf->right_replicant) + || vmf->pud_replicated; + + __replication_path_action(vmf, replicated); + return replicated; +} + +static bool replication_path_pud(struct vm_fault *vmf) +{ + bool pmd_folded = mm_pmd_folded(vmf->vma->vm_mm), replicated; + unsigned long address = vmf->real_address; + + if (vmf->replica_action == REPLICA_PROPAGATE) { + /* + * We have already propagated upper level, + * so we'll never use XXX_replicated values again + * during this fault. + */ + vmf->replica_action = REPLICA_NONE; + return false; + } + + if (!pmd_folded) + vmf->pmd_replicated = !pud_none(*(vmf->pud)) && + PageReplicated(virt_to_page(pud_pgtable(*vmf->pud))); + replicated = pmd_folded || overlap_pud_entry(address, vmf->left_replicant, vmf->right_replicant) + || vmf->pmd_replicated; + __replication_path_action(vmf, replicated); + return replicated; +} + +static bool replication_path_pmd(struct vm_fault *vmf) +{ + bool replicated; + unsigned long address = vmf->real_address; + + if (vmf->replica_action == REPLICA_PROPAGATE) { + /* + * We have already propagated upper level, + * so we'll never use XXX_replicated values again + * during this fault. + */ + vmf->replica_action = REPLICA_NONE; + return false; + } + + vmf->pte_replicated = !pmd_none(*(vmf->pmd)) && + PageReplicated(pmd_pgtable(*vmf->pmd)); + replicated = overlap_pmd_entry(address, vmf->left_replicant, vmf->right_replicant) + || vmf->pte_replicated; + __replication_path_action(vmf, replicated); + return replicated; +} + +static void +release_replicated_p4d_tables(int allocated_node, p4d_t **new, struct mm_struct *mm) +{ + int nid; + for_each_memory_node(nid) { + if (nid == allocated_node || new[nid] == NULL) + continue; + p4d_free(mm, new[nid]); + } +} + +static void +release_replicated_pud_tables(int allocated_node, pud_t **new, struct mm_struct *mm) +{ + int nid; + for_each_memory_node(nid) { + if (nid == allocated_node || new[nid] == NULL) + continue; + pud_free(mm, new[nid]); + } +} + +static void +release_replicated_pmd_tables(int allocated_node, pmd_t **new, struct mm_struct *mm) +{ + int nid; + for_each_memory_node(nid) { + if (nid == allocated_node || new[nid] == NULL) + continue; + pmd_free(mm, new[nid]); + } +} + +static void +release_replicated_pte_tables(int allocated_node, struct ptdesc **new, struct mm_struct *mm) +{ + int nid; + for_each_memory_node(nid) { + if (nid == allocated_node || new[nid] == NULL) + continue; + + if (allocated_node == NUMA_NO_NODE) { + ClearPageReplicated(ptdesc_page(new[nid])); + new[nid]->replica_list_node.next = NULL; + } + pte_free(mm, ptdesc_page(new[nid])); + } +} + +static void +sync_replicated_p4d_tables(int allocated_node, p4d_t **new, pgd_t *start_pgd, struct mm_struct *mm) +{ + int nid; + unsigned long offset; + struct ptdesc *curr; + pgd_t *curr_pgd; + bool start; + + for_each_pgtable(curr, curr_pgd, start_pgd, nid, offset, start) { + SetPageReplicated(virt_to_page(new[nid])); + if (nid == allocated_node) + continue; + + if (allocated_node != NUMA_NO_NODE) + copy_page(new[nid], new[allocated_node]); + + smp_wmb(); + pgd_populate(mm, curr_pgd, new[nid]); + } +} + +static void +sync_replicated_pud_tables(int allocated_node, pud_t **new, p4d_t *start_p4d, struct mm_struct *mm) +{ + int nid; + unsigned long offset; + struct ptdesc 
*curr; + p4d_t *curr_p4d; + bool start; + + /* + * Do not need locking from sync_replicated_pte_tables, + * because pud_lockptr == page_table_lock + */ + + build_pud_chain(new); + set_master_page_for_puds(allocated_node, new); + + for_each_pgtable(curr, curr_p4d, start_p4d, nid, offset, start) { + SetPageReplicated(virt_to_page(new[nid])); + if (nid == allocated_node) + continue; + + if (allocated_node != NUMA_NO_NODE) + copy_page(new[nid], new[allocated_node]); + + mm_inc_nr_puds(mm); + smp_wmb(); + p4d_populate(mm, curr_p4d, new[nid]); + } +} + +static void +sync_replicated_pmd_tables(int allocated_node, pmd_t **new, pud_t *start_pud, struct mm_struct *mm) +{ + int nid; + unsigned long offset; + struct ptdesc *curr; + pud_t *curr_pud; + bool start; + + /* + * Locking here the same as in the sync_replicated_pte_tables + */ + spinlock_t *ptl = NULL; + if (allocated_node != NUMA_NO_NODE) { + ptl = pmd_lockptr(mm, new[allocated_node]); + spin_lock_nested(ptl, 1); + } + + BUILD_BUG_ON(!USE_SPLIT_PMD_PTLOCKS); + + build_pmd_chain(new); + set_master_page_for_pmds(allocated_node, new); + + for_each_pgtable(curr, curr_pud, start_pud, nid, offset, start) { + SetPageReplicated(virt_to_page(new[nid])); + if (nid == allocated_node) + continue; + if (allocated_node != NUMA_NO_NODE) + copy_page(new[nid], new[allocated_node]); + + mm_inc_nr_pmds(mm); + smp_wmb(); + pud_populate(mm, curr_pud, new[nid]); + } + + if (ptl) + spin_unlock(ptl); +} + +#ifdef CONFIG_ARM64 +static void +sync_replicated_pte_tables(int allocated_node, struct ptdesc **new, pmd_t *start_pmd, struct mm_struct *mm) +{ + int nid; + unsigned long offset; + struct ptdesc *curr; + pmd_t *curr_pmd; + bool start; + spinlock_t *ptl = NULL; + + /* Why we need (sometimes) ptl from allocated_node here? + * If replicate existed table, concurrent page fault might + * observe replicated table which content was not copied + * from original table yet. At this point master_locks are + * already set (which is lock from original table), so we + * need to hold it here. + * + * Obviously, if there was no any table before, + * we do not need to hold any pte lock at all, everything will be propagated + * correctly via replica_list + */ + BUILD_BUG_ON(!USE_SPLIT_PTE_PTLOCKS); + + if (allocated_node != NUMA_NO_NODE) { + ptl = ptlock_ptr(new[allocated_node]); + spin_lock_nested(ptl, 1); + + build_pte_chain(new); + set_master_page_for_ptes(allocated_node, new); + + for_each_memory_node(nid) { + SetPageReplicated(ptdesc_page(new[nid])); + if (nid == allocated_node) + continue; + copy_page(ptdesc_to_virt(new[nid]), ptdesc_to_virt(new[allocated_node])); + } + } + + smp_wmb(); + + for_each_pgtable(curr, curr_pmd, start_pmd, nid, offset, start) { + /* + * We are safe to set this flag here even for original table, + * because replica list have already been created. 
+ * So, in the case if some propagation will be required, + * we are able to do it, even if not all upper tables are populated yet + */ + if (nid == allocated_node) + continue; + + mm_inc_nr_ptes(mm); + + WRITE_ONCE(*curr_pmd, __pmd(__phys_to_pmd_val(page_to_phys(ptdesc_page(new[nid]))) | + PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_PXN)); + } + + dsb(ishst); + isb(); + + if (ptl) + spin_unlock(ptl); +} +#endif + +static int +prepare_replicated_p4d_tables(int allocated_node, p4d_t **new, struct mm_struct *mm, unsigned long address) +{ + int nid; + p4d_t *new_p4d; + bool fail = false; + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + new_p4d = p4d_alloc_one_node(nid, mm, address); + + if (unlikely(!new_p4d)) + fail = true; + + new[nid] = new_p4d; + } + + if (unlikely(fail)) { + release_replicated_p4d_tables(allocated_node, new, mm); + return -ENOMEM; + } + + return 0; +} + +static int +prepare_replicated_pud_tables(int allocated_node, pud_t **new, struct mm_struct *mm, unsigned long address) +{ + int nid; + pud_t *new_pud; + bool fail = false; + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + new_pud = pud_alloc_one_node(nid, mm, address); + + if (unlikely(!new_pud)) + fail = true; + + new[nid] = new_pud; + } + + if (unlikely(fail)) { + release_replicated_pud_tables(allocated_node, new, mm); + return -ENOMEM; + } + + return 0; +} + +static int +prepare_replicated_pmd_tables(int allocated_node, pmd_t **new, struct mm_struct *mm, unsigned long address) +{ + int nid; + pmd_t *new_pmd; + bool fail = false; + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + new_pmd = pmd_alloc_one_node(nid, mm, address); + + if (unlikely(!new_pmd)) + fail = true; + + new[nid] = new_pmd; + } + + if (unlikely(fail)) { + release_replicated_pmd_tables(allocated_node, new, mm); + return -ENOMEM; + } + + return 0; +} + +static int +prepare_replicated_pte_tables(int allocated_node, struct ptdesc **new, struct mm_struct *mm) +{ + int nid; + struct page *new_pte; + bool fail = false; + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + new_pte = pte_alloc_one_node(nid, mm); + + if (unlikely(!new_pte)) + fail = true; + + new[nid] = page_ptdesc(new_pte); + } + + if (unlikely(fail)) { + release_replicated_pte_tables(allocated_node, new, mm); + return -ENOMEM; + } + + if (allocated_node == NUMA_NO_NODE) { + build_pte_chain(new); + set_master_page_for_ptes(allocated_node, new); + + for_each_memory_node(nid) { + SetPageReplicated(ptdesc_page(new[nid])); + } + } + + return 0; +} + +static vm_fault_t replication_handle_pgd_fault(struct vm_fault *vmf) +{ + unsigned long address = vmf->real_address; + struct mm_struct *mm = vmf->vma->vm_mm; + + vmf->pgd = pgd_offset_pgd(mm->pgd, address); + + return 0; +} + +/* TODO Need to clarify, how this going to work with and without 5-level paging*/ +static vm_fault_t replication_handle_p4d_fault(struct vm_fault *vmf) +{ + int ret; + p4d_t *p4d_tables[MAX_NUMNODES]; + unsigned long address = vmf->real_address; + struct mm_struct *mm = vmf->vma->vm_mm; + + /* See replication_handle_pgd_fault in mm/numa_replication.c */ + if (replicated_p4d_level(vmf)) { + if (!pgd_none(*vmf->pgd)) { + vmf->p4d = p4d_offset(vmf->pgd, address); + return 0; + } + ret = prepare_replicated_p4d_tables(NUMA_NO_NODE, p4d_tables, mm, address); + if (ret) + goto fault_oom; + + spin_lock(&mm->page_table_lock); + if (pgd_present(*vmf->pgd)) { + /* Someone else has replicated this level */ + 
BUG_ON(!PageReplicated(virt_to_page(pgd_page_vaddr(*(vmf->pgd))))); + release_replicated_p4d_tables(NUMA_NO_NODE, p4d_tables, mm); + } else + sync_replicated_p4d_tables(NUMA_NO_NODE, p4d_tables, vmf->pgd, mm); + spin_unlock(&mm->page_table_lock); + + } else { + p4d_t *table_page = (p4d_t *)pgd_page_vaddr(*(vmf->pgd)); + int p4d_node = page_to_nid(virt_to_page(table_page)); + + p4d_tables[p4d_node] = table_page; + ret = prepare_replicated_p4d_tables(p4d_node, p4d_tables, mm, address); + if (ret) + goto fault_oom; + + spin_lock(&mm->page_table_lock); + if (PageReplicated(virt_to_page(table_page))) + /* Someone else has replicated this level */ + release_replicated_p4d_tables(p4d_node, p4d_tables, mm); + else + sync_replicated_p4d_tables(p4d_node, p4d_tables, vmf->pgd, mm); + spin_unlock(&mm->page_table_lock); + } + + vmf->p4d = p4d_offset(vmf->pgd, address); + + return 0; + +fault_oom: + vmf->replica_action = REPLICA_FAIL; + return VM_FAULT_OOM; +} + +static vm_fault_t replication_handle_pud_fault(struct vm_fault *vmf) +{ + int ret; + pud_t *pud_tables[MAX_NUMNODES]; + unsigned long address = vmf->real_address; + struct mm_struct *mm = vmf->vma->vm_mm; + + /* See replication_handle_pgd_fault in mm/numa_replication.c */ + if (replicated_pud_level(vmf)) { + if (!p4d_none(*vmf->p4d)) { + vmf->pud = pud_offset(vmf->p4d, address); + return 0; + } + ret = prepare_replicated_pud_tables(NUMA_NO_NODE, pud_tables, mm, address); + if (ret) + goto fault_oom; + + spin_lock(&mm->page_table_lock); + if (p4d_present(*vmf->p4d)) { + /* Someone else has replicated this level */ + BUG_ON(!PageReplicated(virt_to_page(p4d_pgtable(*(vmf->p4d))))); + release_replicated_pud_tables(NUMA_NO_NODE, pud_tables, mm); + } else + sync_replicated_pud_tables(NUMA_NO_NODE, pud_tables, vmf->p4d, mm); + spin_unlock(&mm->page_table_lock); + } else { + pud_t* table_page = p4d_pgtable(*(vmf->p4d)); + int pud_node = page_to_nid(virt_to_page(table_page)); + + pud_tables[pud_node] = table_page; + ret = prepare_replicated_pud_tables(pud_node, pud_tables, mm, address); + if (ret) + goto fault_oom; + + spin_lock(&mm->page_table_lock); + if (PageReplicated(virt_to_page(table_page))) + /* Someone else has replicated this level */ + release_replicated_pud_tables(pud_node, pud_tables, mm); + else + sync_replicated_pud_tables(pud_node, pud_tables, vmf->p4d, mm); + spin_unlock(&mm->page_table_lock); + } + + vmf->pud = pud_offset(vmf->p4d, address); + + return 0; + +fault_oom: + vmf->replica_action = REPLICA_FAIL; + return VM_FAULT_OOM; +} + +static vm_fault_t replication_handle_pmd_fault(struct vm_fault *vmf) +{ + int ret; + pmd_t* pmd_tables[MAX_NUMNODES]; + unsigned long address = vmf->real_address; + struct mm_struct *mm = vmf->vma->vm_mm; + spinlock_t *ptl; + + /* See replication_handle_pgd_fault in mm/numa_replication.c */ + if (replicated_pmd_level(vmf)) { + if (!pud_none(*vmf->pud)) { + vmf->pmd = pmd_offset(vmf->pud, address); + return 0; + } + ret = prepare_replicated_pmd_tables(NUMA_NO_NODE, pmd_tables, mm, address); + if (ret) + goto fault_oom; + + ptl = pud_lock(mm, vmf->pud); + if (pud_present(*vmf->pud)) { + /* Someone else has replicated this level */ + BUG_ON(!PageReplicated(virt_to_page(pud_pgtable(*(vmf->pud))))); + release_replicated_pmd_tables(NUMA_NO_NODE, pmd_tables, mm); + } else + sync_replicated_pmd_tables(NUMA_NO_NODE, pmd_tables, vmf->pud, mm); + spin_unlock(ptl); + } else { + pmd_t* table_page = pud_pgtable(*(vmf->pud)); + int pmd_node = page_to_nid(virt_to_page(table_page)); + + pmd_tables[pmd_node] = table_page; 
+ ret = prepare_replicated_pmd_tables(pmd_node, pmd_tables, mm, address); + if (ret) + goto fault_oom; + + ptl = pud_lock(mm, vmf->pud); + if (PageReplicated(virt_to_page(table_page))) + /* Someone else has replicated this level */ + release_replicated_pmd_tables(pmd_node, pmd_tables, mm); + else + sync_replicated_pmd_tables(pmd_node, pmd_tables, vmf->pud, mm); + spin_unlock(ptl); + } + + vmf->pmd = pmd_offset(vmf->pud, address); + + return 0; + +fault_oom: + vmf->replica_action = REPLICA_FAIL; + return VM_FAULT_OOM; +} + +static vm_fault_t replication_handle_pte_fault(struct vm_fault *vmf) +{ + int ret; + struct mm_struct *mm = vmf->vma->vm_mm; + struct ptdesc* pte_tables[MAX_NUMNODES]; + spinlock_t *ptl; + + if (replicated_pte_level(vmf)) { + /* + * If pmd from 0th node populated and PageReplciated flag is set, + * we don't care whether other nodes are populated or not, + * beacause pgtable lists are already built and we can use them + */ + if (!pmd_none(*vmf->pmd)) + return 0; + ret = prepare_replicated_pte_tables(NUMA_NO_NODE, pte_tables, mm); + if (ret) + goto fault_oom; + ptl = pmd_lock(mm, vmf->pmd); + if (unlikely(pmd_present(*vmf->pmd))) { + /* Someone else has replicated this level */ + BUG_ON(!PageReplicated(pmd_pgtable(*(vmf->pmd)))); + spin_unlock(ptl); + release_replicated_pte_tables(NUMA_NO_NODE, pte_tables, mm); + } else { + sync_replicated_pte_tables(NUMA_NO_NODE, pte_tables, vmf->pmd, mm); + spin_unlock(ptl); + } + } else { + struct page* table_page = pmd_pgtable(*(vmf->pmd)); + int pte_node = page_to_nid(table_page); + + pte_tables[pte_node] = page_ptdesc(table_page); + ret = prepare_replicated_pte_tables(pte_node, pte_tables, mm); + if (ret) + goto fault_oom; + + ptl = pmd_lock(mm, vmf->pmd); + if (unlikely(PageReplicated(table_page))) { + spin_unlock(ptl); + /* Someone else has replicated this level */ + release_replicated_pte_tables(pte_node, pte_tables, mm); + } else { + sync_replicated_pte_tables(pte_node, pte_tables, vmf->pmd, mm); + spin_unlock(ptl); + } + } + + return 0; + +fault_oom: + vmf->replica_action = REPLICA_FAIL; + return VM_FAULT_OOM; +} + +pgd_t *fault_pgd_offset(struct vm_fault *vmf, unsigned long address) +{ + vmf->pgd = pgd_offset_pgd(this_node_pgd(vmf->vma->vm_mm), address); + return vmf->pgd; +} + +p4d_t *fault_p4d_alloc(struct vm_fault *vmf, struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + if (replication_path_pgd(vmf)) { + if (replication_handle_p4d_fault(vmf)) + return NULL; + } else { + vmf->p4d = p4d_alloc(mm, pgd, address); + } + + return vmf->p4d; +} + +pud_t *fault_pud_alloc(struct vm_fault *vmf, struct mm_struct *mm, p4d_t *p4d, unsigned long address) +{ + if (vmf->replica_action != REPLICA_NONE && replication_path_p4d(vmf)) { + if (replication_handle_pud_fault(vmf)) + return NULL; + } else { + vmf->pud = pud_alloc(mm, p4d, address); + } + return vmf->pud; +} + +pmd_t *fault_pmd_alloc(struct vm_fault *vmf, struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + if (vmf->replica_action != REPLICA_NONE && replication_path_pud(vmf)) { + if (replication_handle_pmd_fault(vmf)) + return NULL; + } else { + vmf->pmd = pmd_alloc(mm, pud, address); + } + return vmf->pmd; +} + +int fault_pte_alloc(struct vm_fault *vmf) +{ + if (vmf->replica_action != REPLICA_NONE && replication_path_pmd(vmf)) + return replication_handle_pte_fault(vmf); + return 0; +} + +pte_t *cpr_alloc_pte_map_lock(struct mm_struct *dst_mm, unsigned long addr, + pmd_t *src_pmd, pmd_t *dst_pmd, spinlock_t **ptl) +{ + struct ptdesc *pte_tables[MAX_NUMNODES]; + 
struct page *src_pte = pmd_pgtable(*src_pmd); + spinlock_t *pmd_ptl; + + bool pte_replicated_src = numa_pgtable_replicated(page_to_virt(src_pte)); + bool pmd_replicated_dst = numa_pgtable_replicated(dst_pmd); + + if (memcg_replication_enabled(dst_mm) && pte_replicated_src && pmd_replicated_dst) { + if (!pmd_none(*dst_pmd)) { + return pte_offset_map_lock(dst_mm, dst_pmd, addr, ptl); + } + + if (prepare_replicated_pte_tables(NUMA_NO_NODE, pte_tables, dst_mm)) + return NULL; + + pmd_ptl = pmd_lock(dst_mm, dst_pmd); + sync_replicated_pte_tables(NUMA_NO_NODE, pte_tables, dst_pmd, dst_mm); + spin_unlock(pmd_ptl); + + return pte_offset_map_lock(dst_mm, dst_pmd, addr, ptl); + } + + return pte_alloc_map_lock(dst_mm, dst_pmd, addr, ptl); +} + +pmd_t *cpr_alloc_pmd(struct mm_struct *dst_mm, unsigned long addr, + pud_t *src_pud, pud_t *dst_pud) +{ + pmd_t *pmd_tables[MAX_NUMNODES]; + pmd_t *src_pmd = pud_pgtable(*src_pud); + spinlock_t *ptl; + + bool pmd_replicated_src = numa_pgtable_replicated(src_pmd); + bool pud_replicated_dst = numa_pgtable_replicated(dst_pud); + + if (memcg_replication_enabled(dst_mm) && pmd_replicated_src && pud_replicated_dst) { + if (!pud_none(*dst_pud)) { + return pmd_offset(dst_pud, addr); + } + + if (prepare_replicated_pmd_tables(NUMA_NO_NODE, pmd_tables, dst_mm, addr)) + return NULL; + + ptl = pud_lock(dst_mm, dst_pud); + sync_replicated_pmd_tables(NUMA_NO_NODE, pmd_tables, dst_pud, dst_mm); + spin_unlock(ptl); + + return pmd_offset(dst_pud, addr); + } + + return pmd_alloc(dst_mm, dst_pud, addr); +} + +pud_t *cpr_alloc_pud(struct mm_struct *dst_mm, unsigned long addr, + p4d_t *src_p4d, p4d_t *dst_p4d) +{ +#if CONFIG_PGTABLE_LEVELS >= 4 + pud_t *pud_tables[MAX_NUMNODES]; + pud_t *src_pud = p4d_pgtable(*src_p4d); + + bool pud_replicated_src = numa_pgtable_replicated(src_pud); + bool p4d_replicated_dst = numa_pgtable_replicated(dst_p4d); + + if (memcg_replication_enabled(dst_mm) && pud_replicated_src && p4d_replicated_dst) { + if (!p4d_none(*dst_p4d)) { + return pud_offset(dst_p4d, addr); + } + + if (prepare_replicated_pud_tables(NUMA_NO_NODE, pud_tables, dst_mm, addr)) + return NULL; + + spin_lock(&dst_mm->page_table_lock); + sync_replicated_pud_tables(NUMA_NO_NODE, pud_tables, dst_p4d, dst_mm); + spin_unlock(&dst_mm->page_table_lock); + + return pud_offset(dst_p4d, addr); + } + + return pud_alloc(dst_mm, dst_p4d, addr); +#else + return pud_offset(dst_p4d, addr); +#endif +} + +p4d_t *cpr_alloc_p4d(struct mm_struct *dst_mm, unsigned long addr, + pgd_t *src_pgd, pgd_t *dst_pgd) +{ +#if CONFIG_PGTABLE_LEVELS == 5 + p4d_t *p4d_tables[MAX_NUMNODES]; + p4d_t *src_p4d = pgd_pgtable(*src_pgd); + + bool p4d_replicated_src = numa_pgtable_replicated(src_p4d); + + if (memcg_replication_enabled(dst_mm) && p4d_replicated_src) { + if (!pgd_none(*dst_pgd)) { + return p4d_offset(dst_pgd, addr); + } + if (prepare_replicated_p4d_tables(NUMA_NO_NODE, p4d_tables, dst_mm, addr)) + return NULL; + + spin_lock(&dst_mm->page_table_lock); + sync_replicated_p4d_tables(NUMA_NO_NODE, p4d_tables, dst_pgd, dst_mm); + spin_unlock(&dst_mm->page_table_lock); + + return p4d_offset(dst_pgd, addr); + } + + return p4d_alloc(dst_mm, dst_pgd, addr); +#else + return p4d_offset(dst_pgd, addr); +#endif +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2b1191efa284..146d9276edf4d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -205,6 +205,8 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); +nodemask_t __read_mostly replica_nodes = { { [0] = 1UL } }; + gfp_t 
gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; /* @@ -3505,7 +3507,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * * The OOM killer may not free memory on a specific node. */ - if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) + if ((gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) && + !(gfp_mask & __GFP_MAYDIE)) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->highest_zoneidx < ZONE_NORMAL) @@ -4731,7 +4734,8 @@ static inline bool check_after_alloc(gfp_t *gfp, unsigned int order, return true; } - if (*gfp & (__GFP_NORETRY | __GFP_RETRY_MAYFAIL | __GFP_THISNODE)) + if ((*gfp & (__GFP_NORETRY | __GFP_RETRY_MAYFAIL | __GFP_THISNODE)) && + !(*gfp & __GFP_MAYDIE)) goto out; /* Coredumps can quickly deplete all memory reserves */ diff --git a/mm/page_idle.c b/mm/page_idle.c index 41ea77f22011e..cebc1897b12e3 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -12,6 +12,7 @@ #include <linux/mmu_notifier.h> #include <linux/page_ext.h> #include <linux/page_idle.h> +#include <linux/numa_user_replication.h> #include "internal.h" @@ -63,7 +64,7 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio, * For PTE-mapped THP, one sub page is referenced, * the whole THP is referenced. */ - if (ptep_clear_young_notify(vma, addr, pvmw.pte)) + if (ptep_clear_young_notify_replicated(vma, addr, pvmw.pte)) referenced = true; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_young_notify(vma, addr, pvmw.pmd)) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 74d2de15fb5e0..995db39afa955 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -4,6 +4,7 @@ #include <linux/hugetlb.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/numa_user_replication.h> #include "internal.h" @@ -211,7 +212,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) goto next_pte; restart: do { - pgd = pgd_offset(mm, pvmw->address); + pgd = pgd_offset_pgd(this_node_pgd(mm), pvmw->address); if (!pgd_present(*pgd)) { step_forward(pvmw, PGDIR_SIZE); continue; diff --git a/mm/rmap.c b/mm/rmap.c index 86353d274d437..f072b411219ef 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -75,6 +75,7 @@ #include <linux/memremap.h> #include <linux/userfaultfd_k.h> #include <linux/mm_inline.h> +#include <linux/numa_user_replication.h> #include <asm/tlbflush.h> @@ -843,7 +844,7 @@ static bool folio_referenced_one(struct folio *folio, referenced++; } - if (ptep_clear_flush_young_notify(vma, address, + if (ptep_clear_flush_young_notify_replicated(vma, address, pvmw.pte)) referenced++; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { @@ -992,10 +993,10 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) continue; flush_cache_page(vma, address, pte_pfn(entry)); - entry = ptep_clear_flush(vma, address, pte); + entry = ptep_clear_flush_replicated(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); - set_pte_at(vma->vm_mm, address, pte, entry); + set_pte_at_replicated(vma->vm_mm, address, pte, entry); ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1744,11 +1745,11 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. 
*/ - pteval = ptep_get_and_clear(mm, address, pvmw.pte); + pteval = ptep_get_and_clear_replicated(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pteval, address); } else { - pteval = ptep_clear_flush(vma, address, pvmw.pte); + pteval = ptep_clear_flush_replicated(vma, address, pvmw.pte); } } @@ -1775,7 +1776,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } else { dec_mm_counter(mm, mm_counter(folio)); add_reliable_folio_counter(folio, mm, -1); - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); } } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { @@ -1839,18 +1840,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * If the folio was redirtied, it cannot be * discarded. Remap the page to page table. */ - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); folio_set_swapbacked(folio); goto walk_abort; } if (swap_duplicate(entry) < 0) { - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); goto walk_abort; } if (arch_unmap_one(mm, vma, address, pteval) < 0) { swap_free(entry); - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); goto walk_abort; } @@ -1858,7 +1859,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { swap_free(entry); - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); goto walk_abort; } if (list_empty(&mm->mmlist)) { @@ -1877,7 +1878,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); - set_pte_at(mm, address, pvmw.pte, swp_pte); + set_pte_at_replicated(mm, address, pvmw.pte, swp_pte); } else { /* * This is a locked file-backed folio, @@ -2115,11 +2116,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. 
*/ - pteval = ptep_get_and_clear(mm, address, pvmw.pte); + pteval = ptep_get_and_clear_replicated(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pteval, address); } else { - pteval = ptep_clear_flush(vma, address, pvmw.pte); + pteval = ptep_clear_flush_replicated(vma, address, pvmw.pte); } } @@ -2161,7 +2162,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); - set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + set_pte_at_replicated(mm, pvmw.address, pvmw.pte, swp_pte); trace_set_migration_pte(pvmw.address, pte_val(swp_pte), compound_order(&folio->page)); /* @@ -2177,7 +2178,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, } else { dec_mm_counter(mm, mm_counter(folio)); add_reliable_folio_counter(folio, mm, -1); - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); } } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { @@ -2202,7 +2203,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); else - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; @@ -2222,7 +2223,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, } } else if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; @@ -2255,7 +2256,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, set_huge_pte_at(mm, address, pvmw.pte, swp_pte, hsz); else - set_pte_at(mm, address, pvmw.pte, swp_pte); + set_pte_at_replicated(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), compound_order(&folio->page)); /* @@ -2369,7 +2370,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(ptent)); - pteval = ptep_clear_flush(vma, address, pvmw.pte); + pteval = ptep_clear_flush_replicated(vma, address, pvmw.pte); /* Set the dirty flag on the folio now the pte is gone. 
*/ if (pte_dirty(pteval)) @@ -2400,7 +2401,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); - set_pte_at(mm, address, pvmw.pte, swp_pte); + set_pte_at_replicated(mm, address, pvmw.pte, swp_pte); /* * There is a reference on the page for the swap entry which has diff --git a/mm/swap.c b/mm/swap.c index 9bc530395949b..7dcdfbf6913ed 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -114,6 +114,8 @@ static void page_cache_release(struct folio *folio) void __folio_put(struct folio *folio) { + ClearPageReplicated(folio_page(folio, 0)); + if (unlikely(folio_is_zone_device(folio))) { free_zone_device_folio(folio); return; @@ -1000,13 +1002,16 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) } if (put_devmap_managed_page_refs(&folio->page, nr_refs)) continue; - if (folio_ref_sub_and_test(folio, nr_refs)) + if (folio_ref_sub_and_test(folio, nr_refs)) { + ClearPageReplicated(folio_page(folio, 0)); free_zone_device_folio(folio); + } continue; } if (!folio_ref_sub_and_test(folio, nr_refs)) continue; + ClearPageReplicated(folio_page(folio, 0)); /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 3af5b6ebb2412..4d87dcd43719f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -42,6 +42,7 @@ #include <linux/completion.h> #include <linux/suspend.h> #include <linux/zswap.h> +#include <linux/numa_user_replication.h> #include <asm/tlbflush.h> #include <linux/swapops.h> @@ -2015,7 +2016,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: - set_pte_at(vma->vm_mm, addr, pte, new_pte); + set_pte_at_replicated(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: if (pte) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 2c71a269a52c6..e25964e4d4700 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -16,6 +16,7 @@ #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/userswap.h> +#include <linux/numa_user_replication.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" @@ -127,7 +128,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, */ inc_mm_counter(dst_mm, mm_counter(folio)); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_pte_at_replicated(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); @@ -270,7 +271,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, ret = -EEXIST; if (!pte_none(ptep_get(dst_pte))) goto out_unlock; - set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); + set_pte_at_replicated(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; @@ -351,7 +352,7 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, if (!pte_none(ptep_get(dst_pte))) goto out_unlock; - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_pte_at_replicated(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); diff --git a/mm/userswap.c b/mm/userswap.c index 22e3f147ce5f9..749e6bd940853 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -11,6 +11,7 @@ #include <linux/mmu_notifier.h> #include <linux/hugetlb.h> #include <linux/userswap.h> +#include <linux/numa_user_replication.h> #include "internal.h" @@ -154,11 +155,11 @@ static int 
uswap_unmap_anon_page(struct mm_struct *mm, goto out_release_unlock; } flush_cache_page(vma, addr, pte_pfn(*pte)); - _old_pte = ptep_clear_flush(vma, addr, pte); + _old_pte = ptep_clear_flush_replicated(vma, addr, pte); if (old_pte) *old_pte = _old_pte; if (set_to_swp) - set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry( + set_pte_at_replicated(mm, addr, pte, swp_entry_to_pte(swp_entry( SWP_USERSWAP_ENTRY, page_to_pfn(page)))); dec_mm_counter(mm, MM_ANONPAGES); @@ -198,7 +199,7 @@ static unsigned long vm_insert_anon_page(struct vm_area_struct *vma, dst_pte = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte)); - set_pte_at(mm, addr, pte, dst_pte); + set_pte_at_replicated(mm, addr, pte, dst_pte); out_unlock: pte_unmap_unlock(pte, ptl); @@ -217,7 +218,7 @@ static void uswap_map_anon_page(struct mm_struct *mm, pte = pte_offset_map_lock(mm, pmd, addr, &ptl); flush_cache_page(vma, addr, pte_pfn(*pte)); - set_pte_at(mm, addr, pte, old_pte); + set_pte_at_replicated(mm, addr, pte, old_pte); inc_mm_counter(mm, MM_ANONPAGES); add_reliable_page_counter(page, mm, 1); folio_add_new_anon_rmap(page_folio(page), vma, addr, RMAP_EXCLUSIVE); @@ -536,7 +537,7 @@ int mfill_atomic_pte_nocopy(struct mm_struct *mm, pmd_t *dst_pmd, inc_mm_counter(mm, MM_ANONPAGES); add_reliable_page_counter(page, mm, 1); folio_add_new_anon_rmap(page_folio(page), dst_vma, dst_addr, RMAP_EXCLUSIVE); - set_pte_at(mm, dst_addr, pte, dst_pte); + set_pte_at_replicated(mm, dst_addr, pte, dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, pte); -- 2.34.1
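For reviewers following the new fault path in mm/numa_user_replication.c: every replication_handle_*_fault() helper above has the same shape — prepare candidate per-node tables outside the page-table lock, then, under the lock, either publish them into the hierarchy (sync_replicated_*_tables) or, if another thread already replicated the level, drop them again (release_replicated_*_tables). Allocating before taking the lock keeps the possibly-sleeping allocations out of the critical section, which is why the release path has to exist at all. The standalone userspace sketch below only models that publish-or-release step with a plain mutex; every name in it is illustrative and nothing here is part of the patch.

/* Userspace model of the publish-or-release pattern used by the
 * replication_handle_*_fault() helpers; all names are illustrative. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_NODES 4

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *published[NR_NODES];	/* per-node copies of one table level */

static int level_fault(void)
{
	void *candidate[NR_NODES];
	int nid;

	/* 1. Prepare candidate tables for every node, outside the lock. */
	for (nid = 0; nid < NR_NODES; nid++) {
		candidate[nid] = calloc(1, 4096);
		if (!candidate[nid]) {
			while (nid--)
				free(candidate[nid]);
			return -1;	/* maps to VM_FAULT_OOM in the patch */
		}
	}

	/* 2. Under the lock, either publish or back off. */
	pthread_mutex_lock(&table_lock);
	if (published[0]) {
		/* Someone else has replicated this level: release. */
		pthread_mutex_unlock(&table_lock);
		for (nid = 0; nid < NR_NODES; nid++)
			free(candidate[nid]);
		return 0;
	}
	for (nid = 0; nid < NR_NODES; nid++)	/* sync: make the copies visible */
		published[nid] = candidate[nid];
	pthread_mutex_unlock(&table_lock);
	return 0;
}

int main(void)
{
	printf("fault -> %d, node0 table %p\n", level_fault(), published[0]);
	return 0;
}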
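The cpr_alloc_*() helpers are the fork-time counterpart: when replication is enabled for the destination mm and both the source level and the destination parent entry are already replicated, the child level is pre-replicated (prepare + sync) instead of going through the ordinary p?d_alloc() path; otherwise the helpers fall back to the stock allocators. A compressed, purely illustrative model of that decision follows — all helpers are stubbed out and the names are invented for the sketch.

/* Decision structure of cpr_alloc_pmd() and friends, stubbed for
 * illustration; nothing here is the kernel implementation. */
#include <stdbool.h>
#include <stdio.h>

enum copy_path { COPY_REPLICATED, COPY_PLAIN, COPY_ALREADY_POPULATED };

static enum copy_path choose_copy_path(bool replication_enabled,
				       bool src_level_replicated,
				       bool dst_parent_replicated,
				       bool dst_already_populated)
{
	if (replication_enabled && src_level_replicated && dst_parent_replicated) {
		if (dst_already_populated)
			return COPY_ALREADY_POPULATED;	/* just p?d_offset()  */
		return COPY_REPLICATED;			/* prepare + sync     */
	}
	return COPY_PLAIN;				/* plain p?d_alloc()  */
}

int main(void)
{
	printf("path = %d\n", choose_copy_path(true, true, true, false));
	return 0;
}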
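In the mm/page_alloc.c hunks, __GFP_THISNODE and __GFP_RETRY_MAYFAIL normally make the allocator bail out before invoking the OOM killer; the new __GFP_MAYDIE bit appears to opt such allocations back into the OOM path. The changed condition reduces to the predicate sketched below — plain unsigned ints with placeholder flag values, not the real gfp_t plumbing.

/* Sketch of the changed OOM-suppression predicate; the flag values are
 * placeholders, only the boolean structure mirrors the hunks above. */
#include <stdbool.h>
#include <stdio.h>

#define GFP_RETRY_MAYFAIL	0x1u
#define GFP_THISNODE		0x2u
#define GFP_MAYDIE		0x4u	/* added by this series */

static bool skip_oom_kill(unsigned int gfp_mask)
{
	/* Before: THISNODE or RETRY_MAYFAIL always bailed out.
	 * After:  an allocation also marked MAYDIE may still reach the OOM killer. */
	return (gfp_mask & (GFP_RETRY_MAYFAIL | GFP_THISNODE)) &&
	       !(gfp_mask & GFP_MAYDIE);
}

int main(void)
{
	printf("%d %d\n",
	       skip_oom_kill(GFP_THISNODE),			/* 1: bail out */
	       skip_oom_kill(GFP_THISNODE | GFP_MAYDIE));	/* 0: may OOM  */
	return 0;
}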
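Most of the rmap/swapfile/userfaultfd/userswap churn is mechanical: every former set_pte_at(), ptep_clear_flush() or ptep_get_and_clear() call now goes through a *_replicated wrapper so that the update reaches every node's copy of the PTE table, and the rmap walk in page_vma_mapped_walk() starts from this_node_pgd() for the same reason. Below is a minimal userspace model of the mirroring idea, assuming a fixed node count and ignoring TLB maintenance; it is not the wrapper implementation itself.

/* Minimal model of mirroring one PTE update into every node's copy of a
 * PTE table; purely illustrative. */
#include <stdint.h>
#include <stdio.h>

#define NR_NODES	4
#define PTRS_PER_PTE	512

typedef uint64_t pte_t;

/* One PTE page per node; index 0 plays the role of the primary table. */
static pte_t pte_replicas[NR_NODES][PTRS_PER_PTE];

static void set_pte_replicated(unsigned long index, pte_t val)
{
	int nid;

	/* The same entry must be visible through every node's replica. */
	for (nid = 0; nid < NR_NODES; nid++)
		pte_replicas[nid][index] = val;
}

static pte_t clear_pte_replicated(unsigned long index)
{
	pte_t old = pte_replicas[0][index];
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		pte_replicas[nid][index] = 0;
	return old;	/* the caller would flush the TLB here */
}

int main(void)
{
	set_pte_replicated(7, 0xabcd000ULL | 1);
	printf("cleared 0x%llx\n",
	       (unsigned long long)clear_pte_replicated(7));
	return 0;
}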
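Finally, the mm/swap.c hunks clear the replicated page flag only once folio_ref_sub_and_test() confirms the final reference is being dropped, so the flag cannot follow a freed page back into the allocator. A toy refcount model of that ordering, illustrative only:

/* Model of clearing a "replicated" flag only on the final reference drop,
 * as done before freeing in __folio_put()/folios_put_refs(); illustrative. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_folio {
	atomic_int refcount;
	bool replicated;	/* stands in for the replicated page flag */
};

static void folio_put(struct fake_folio *folio)
{
	/* Only the thread dropping the last reference may touch the flag. */
	if (atomic_fetch_sub(&folio->refcount, 1) == 1) {
		folio->replicated = false;
		free(folio);
	}
}

int main(void)
{
	struct fake_folio *f = malloc(sizeof(*f));

	if (!f)
		return 1;
	atomic_init(&f->refcount, 2);
	f->replicated = true;
	folio_put(f);		/* still referenced, flag untouched */
	folio_put(f);		/* last put: flag cleared, then freed */
	printf("done\n");
	return 0;
}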