Anshuman Khandual (1):
      mm/ioremap: probe platform for p4d huge map support

Christoph Hellwig (8):
      mm: remove __get_vm_area
      mm: unexport unmap_kernel_range_noflush
      mm: only allow page table mappings for built-in zsmalloc
      mm: pass addr as unsigned long to vb_free
      mm: remove vmap_page_range_noflush and vunmap_page_range
      mm: rename vmap_page_range to map_kernel_range
      mm: don't return the number of pages from map_kernel_range{,_noflush}
      mm: remove map_vm_range

Daniel Axtens (1):
      mm/memory.c: add apply_to_existing_page_range() helper

Ingo Molnar (1):
      mm/vmalloc: Add empty <asm/vmalloc.h> headers and use them from <linux/vmalloc.h>

Mike Rapoport (1):
      mm: move lib/ioremap.c to mm/

Nicholas Piggin (9):
      mm/vmalloc: fix vmalloc_to_page for huge vmap mappings
      mm: apply_to_pte_range warn and fail if a large pte is encountered
      mm/vmalloc: rename vmap_*_range vmap_pages_*_range
      mm/ioremap: rename ioremap_*_range to vmap_*_range
      mm: HUGE_VMAP arch support cleanup
      arm64: inline huge vmap supported functions
      mm: Move vmap_range from mm/ioremap.c to mm/vmalloc.c
      mm/vmalloc: add vmap_range_noflush variant
      mm/vmalloc: Hugepage vmalloc mappings

Steven Price (2):
      mm: add generic p?d_leaf() macros
      arm64: mm: add p?d_leaf() definitions

Will Deacon (3):
      ioremap: rework pXd_free_pYd_page() API
      lib/ioremap: ensure phys_addr actually corresponds to a physical address
      lib/ioremap: ensure break-before-make is used for huge p4d mappings
 Documentation/core-api/cachetlb.rst   |   2 +-
 arch/Kconfig                          |   4 +
 arch/alpha/include/asm/vmalloc.h      |   4 +
 arch/arc/include/asm/vmalloc.h        |   4 +
 arch/arm/include/asm/vmalloc.h        |   4 +
 arch/arm64/include/asm/pgtable.h      |   2 +
 arch/arm64/include/asm/vmalloc.h      |  29 ++
 arch/arm64/mm/mmu.c                   |  16 -
 arch/c6x/include/asm/vmalloc.h        |   4 +
 arch/csky/include/asm/vmalloc.h       |   4 +
 arch/h8300/include/asm/vmalloc.h      |   4 +
 arch/hexagon/include/asm/vmalloc.h    |   4 +
 arch/ia64/include/asm/vmalloc.h       |   4 +
 arch/m68k/include/asm/vmalloc.h       |   4 +
 arch/microblaze/include/asm/vmalloc.h |   4 +
 arch/mips/include/asm/vmalloc.h       |   4 +
 arch/nds32/include/asm/vmalloc.h      |   4 +
 arch/nios2/include/asm/vmalloc.h      |   4 +
 arch/openrisc/include/asm/vmalloc.h   |   4 +
 arch/parisc/include/asm/vmalloc.h     |   4 +
 arch/powerpc/include/asm/vmalloc.h    |  12 +
 arch/powerpc/kernel/pci_64.c          |   3 +-
 arch/riscv/include/asm/vmalloc.h      |   4 +
 arch/s390/include/asm/vmalloc.h       |   4 +
 arch/sh/include/asm/vmalloc.h         |   4 +
 arch/sh/kernel/cpu/sh4/sq.c           |   3 +-
 arch/sparc/include/asm/vmalloc.h      |   4 +
 arch/um/include/asm/vmalloc.h         |   4 +
 arch/unicore32/include/asm/vmalloc.h  |   4 +
 arch/x86/include/asm/vmalloc.h        |  12 +
 arch/x86/mm/ioremap.c                 |  11 +-
 arch/x86/mm/pgtable.c                 |   8 +
 arch/xtensa/include/asm/vmalloc.h     |   4 +
 include/asm-generic/pgtable.h         |  25 ++
 include/linux/io.h                    |   8 -
 include/linux/mm.h                    |   3 +
 include/linux/vmalloc.h               |  24 +-
 init/main.c                           |   1 -
 kernel/dma/mapping.c                  |   3 +-
 lib/Makefile                          |   1 -
 lib/ioremap.c                         | 183 ---------
 mm/Kconfig                            |   2 +-
 mm/Makefile                           |   2 +-
 mm/ioremap.c                          |  34 ++
 mm/memory.c                           | 152 ++++++--
 mm/vmalloc.c                          | 518 +++++++++++++++++++------
 mm/zsmalloc.c                         |   4 +-
 47 files changed, 752 insertions(+), 398 deletions(-)
 create mode 100644 arch/alpha/include/asm/vmalloc.h
 create mode 100644 arch/arc/include/asm/vmalloc.h
 create mode 100644 arch/arm/include/asm/vmalloc.h
 create mode 100644 arch/arm64/include/asm/vmalloc.h
 create mode 100644 arch/c6x/include/asm/vmalloc.h
 create mode 100644 arch/csky/include/asm/vmalloc.h
 create mode 100644 arch/h8300/include/asm/vmalloc.h
 create mode 100644 arch/hexagon/include/asm/vmalloc.h
 create mode 100644 arch/ia64/include/asm/vmalloc.h
 create mode 100644 arch/m68k/include/asm/vmalloc.h
 create mode 100644 arch/microblaze/include/asm/vmalloc.h
 create mode 100644 arch/mips/include/asm/vmalloc.h
 create mode 100644 arch/nds32/include/asm/vmalloc.h
 create mode 100644 arch/nios2/include/asm/vmalloc.h
 create mode 100644 arch/openrisc/include/asm/vmalloc.h
 create mode 100644 arch/parisc/include/asm/vmalloc.h
 create mode 100644 arch/powerpc/include/asm/vmalloc.h
 create mode 100644 arch/riscv/include/asm/vmalloc.h
 create mode 100644 arch/s390/include/asm/vmalloc.h
 create mode 100644 arch/sh/include/asm/vmalloc.h
 create mode 100644 arch/sparc/include/asm/vmalloc.h
 create mode 100644 arch/um/include/asm/vmalloc.h
 create mode 100644 arch/unicore32/include/asm/vmalloc.h
 create mode 100644 arch/x86/include/asm/vmalloc.h
 create mode 100644 arch/xtensa/include/asm/vmalloc.h
 delete mode 100644 lib/ioremap.c
 create mode 100644 mm/ioremap.c
From: Will Deacon <will.deacon@arm.com>

mainline inclusion
from mainline-v5.0-rc1
commit d239865ac804c91a621294ca7bece4241b006fae
category: feature
bugzilla: NA
CVE: NA
---------------------------
The recently merged API for ensuring break-before-make on page-table entries when installing huge mappings in the vmalloc/ioremap region is fairly counter-intuitive, resulting in the arch freeing functions (e.g. pmd_free_pte_page()) being called even on entries that aren't present. This resulted in a minor bug in the arm64 implementation, giving rise to spurious VM_WARN messages.
This patch moves the pXd_present() checks out into the core code, refactoring the callsites at the same time so that we avoid the complex conjunctions when determining whether or not we can put down a huge mapping.
Link: http://lkml.kernel.org/r/1544120495-17438-2-git-send-email-will.deacon@arm.c...
Signed-off-by: Will Deacon <will.deacon@arm.com>
Reviewed-by: Toshi Kani <toshi.kani@hpe.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 lib/ioremap.c | 56 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 14 deletions(-)
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 517f5853ffed..6c72764af19c 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -76,6 +76,25 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 
+static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
+				unsigned long end, phys_addr_t phys_addr,
+				pgprot_t prot)
+{
+	if (!ioremap_pmd_enabled())
+		return 0;
+
+	if ((end - addr) != PMD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+		return 0;
+
+	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+		return 0;
+
+	return pmd_set_huge(pmd, phys_addr, prot);
+}
+
 static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
@@ -89,13 +108,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 	do {
 		next = pmd_addr_end(addr, end);
 
-		if (ioremap_pmd_enabled() &&
-		    ((next - addr) == PMD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
-		    pmd_free_pte_page(pmd, addr)) {
-			if (pmd_set_huge(pmd, phys_addr + addr, prot))
-				continue;
-		}
+		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr + addr, prot))
+			continue;
 
 		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
 			return -ENOMEM;
@@ -103,6 +117,25 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 	return 0;
 }
 
+static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
+				unsigned long end, phys_addr_t phys_addr,
+				pgprot_t prot)
+{
+	if (!ioremap_pud_enabled())
+		return 0;
+
+	if ((end - addr) != PUD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+		return 0;
+
+	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+		return 0;
+
+	return pud_set_huge(pud, phys_addr, prot);
+}
+
 static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
@@ -116,13 +149,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 	do {
 		next = pud_addr_end(addr, end);
 
-		if (ioremap_pud_enabled() &&
-		    ((next - addr) == PUD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
-		    pud_free_pmd_page(pud, addr)) {
-			if (pud_set_huge(pud, phys_addr + addr, prot))
-				continue;
-		}
+		if (ioremap_try_huge_pud(pud, addr, next, phys_addr + addr, prot))
+			continue;
 
 		if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
 			return -ENOMEM;
From: Will Deacon <will.deacon@arm.com>

mainline inclusion
from mainline-v5.0-rc1
commit 36ddc5a78c878e9b10c323d2fe88b0dae2f487eb
category: feature
bugzilla: NA
CVE: NA
---------------------------
The current ioremap() code uses a phys_addr variable at each level of page table, which is confusingly offset by subtracting the base virtual address being mapped so that adding the current virtual address back on when iterating through the page table entries gives back the corresponding physical address.
This is fairly confusing and results in all users of phys_addr having to add the current virtual address back on. Instead, this patch just updates phys_addr when iterating over the page table entries, ensuring that it's always up-to-date and doesn't require explicit offsetting.
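As an aside, the difference is easiest to see in a small stand-alone sketch; walk_old() and walk_new() below are hypothetical illustrations of the two idioms, not kernel code:

/*
 * Minimal user-space sketch contrasting the two idioms; walk_old() and
 * walk_new() are hypothetical names, not part of this patch.
 */
#include <stdio.h>

#define STEP 0x1000UL	/* pretend page-table-entry granularity */

/* Old idiom: phys is pre-offset by -addr, each use must re-add addr. */
static void walk_old(unsigned long addr, unsigned long end, unsigned long phys)
{
	phys -= addr;				/* the confusing offset */
	for (; addr != end; addr += STEP)
		printf("virt %#lx -> phys %#lx\n", addr, phys + addr);
}

/* New idiom: phys stays a real physical address and advances with addr. */
static void walk_new(unsigned long addr, unsigned long end, unsigned long phys)
{
	for (; addr != end; addr += STEP, phys += STEP)
		printf("virt %#lx -> phys %#lx\n", addr, phys);
}

int main(void)
{
	/* Both walks print identical virt -> phys mappings. */
	walk_old(0xffff8000UL, 0xffffb000UL, 0x40000000UL);
	walk_new(0xffff8000UL, 0xffffb000UL, 0x40000000UL);
	return 0;
}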
Link: http://lkml.kernel.org/r/1544120495-17438-5-git-send-email-will.deacon@arm.c...
Signed-off-by: Will Deacon <will.deacon@arm.com>
Tested-by: Sean Christopherson <sean.j.christopherson@intel.com>
Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 lib/ioremap.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 6c72764af19c..10d7c5485c39 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -101,19 +101,18 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 	pmd_t *pmd;
 	unsigned long next;
 
-	phys_addr -= addr;
 	pmd = pmd_alloc(&init_mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
 	do {
 		next = pmd_addr_end(addr, end);
 
-		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr + addr, prot))
+		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot))
 			continue;
 
-		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
+		if (ioremap_pte_range(pmd, addr, next, phys_addr, prot))
 			return -ENOMEM;
-	} while (pmd++, addr = next, addr != end);
+	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
 }
 
@@ -142,19 +141,18 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 	pud_t *pud;
 	unsigned long next;
 
-	phys_addr -= addr;
 	pud = pud_alloc(&init_mm, p4d, addr);
 	if (!pud)
 		return -ENOMEM;
 	do {
 		next = pud_addr_end(addr, end);
 
-		if (ioremap_try_huge_pud(pud, addr, next, phys_addr + addr, prot))
+		if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot))
 			continue;
 
-		if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
+		if (ioremap_pmd_range(pud, addr, next, phys_addr, prot))
 			return -ENOMEM;
-	} while (pud++, addr = next, addr != end);
+	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
 }
 
@@ -164,7 +162,6 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
 	p4d_t *p4d;
 	unsigned long next;
 
-	phys_addr -= addr;
 	p4d = p4d_alloc(&init_mm, pgd, addr);
 	if (!p4d)
 		return -ENOMEM;
@@ -173,14 +170,14 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
 	do {
 		next = p4d_addr_end(addr, end);
 
 		if (ioremap_p4d_enabled() &&
 		    ((next - addr) == P4D_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, P4D_SIZE)) {
-			if (p4d_set_huge(p4d, phys_addr + addr, prot))
+		    IS_ALIGNED(phys_addr, P4D_SIZE)) {
+			if (p4d_set_huge(p4d, phys_addr, prot))
 				continue;
 		}
 
-		if (ioremap_pud_range(p4d, addr, next, phys_addr + addr, prot))
+		if (ioremap_pud_range(p4d, addr, next, phys_addr, prot))
 			return -ENOMEM;
-	} while (p4d++, addr = next, addr != end);
+	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
 }
 
@@ -196,14 +193,13 @@ int ioremap_page_range(unsigned long addr,
 	BUG_ON(addr >= end);
 
 	start = addr;
-	phys_addr -= addr;
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = ioremap_p4d_range(pgd, addr, next, phys_addr+addr, prot);
+		err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot);
 		if (err)
 			break;
-	} while (pgd++, addr = next, addr != end);
+	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
 
 	flush_cache_vmap(start, end);
From: Will Deacon <will.deacon@arm.com>

mainline inclusion
from mainline-v5.0-rc1
commit 8e2d43405b22e98cf5f3730c1829ec1fdbe17ae7
category: feature
bugzilla: NA
CVE: NA
---------------------------
Whilst no architectures actually enable support for huge p4d mappings in the vmap area, the code that is implemented should be using break-before-make, as we do for pud and pmd huge entries.
Link: http://lkml.kernel.org/r/1544120495-17438-6-git-send-email-will.deacon@arm.c...
Signed-off-by: Will Deacon <will.deacon@arm.com>
Reviewed-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/mm/mmu.c           |  5 +++++
 arch/x86/mm/pgtable.c         |  8 ++++++++
 include/asm-generic/pgtable.h |  5 +++++
 lib/ioremap.c                 | 27 +++++++++++++++++++++------
 4 files changed, 39 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 5673db3b4cc4..99a86326a301 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1036,6 +1036,11 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
 	return 1;
 }
 
+int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
+{
+	return 0;	/* Don't attempt a block mapping */
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 		    bool want_memblock)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index bf52106ab9c4..5b97717b389b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -794,6 +794,14 @@ int pmd_clear_huge(pmd_t *pmd)
 	return 0;
 }
 
+/*
+ * Until we support 512GB pages, skip them in the vmap area.
+ */
+int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
+{
+	return 0;
+}
+
 #ifdef CONFIG_X86_64
 /**
  * pud_free_pmd_page - Clear pud entry and free pmd page.
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index b96231975552..a7912c229033 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1059,6 +1059,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
+int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
 int pud_free_pmd_page(pud_t *pud, unsigned long addr);
 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
 #else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
@@ -1086,6 +1087,10 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 {
 	return 0;
 }
+static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
+{
+	return 0;
+}
 static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
 	return 0;
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 10d7c5485c39..063213685563 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -156,6 +156,25 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 	return 0;
 }
 
+static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
+				unsigned long end, phys_addr_t phys_addr,
+				pgprot_t prot)
+{
+	if (!ioremap_p4d_enabled())
+		return 0;
+
+	if ((end - addr) != P4D_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+		return 0;
+
+	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+		return 0;
+
+	return p4d_set_huge(p4d, phys_addr, prot);
+}
+
 static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
 		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
@@ -168,12 +187,8 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
 	do {
 		next = p4d_addr_end(addr, end);
 
-		if (ioremap_p4d_enabled() &&
-		    ((next - addr) == P4D_SIZE) &&
-		    IS_ALIGNED(phys_addr, P4D_SIZE)) {
-			if (p4d_set_huge(p4d, phys_addr, prot))
-				continue;
-		}
+		if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot))
+			continue;
 
 		if (ioremap_pud_range(p4d, addr, next, phys_addr, prot))
 			return -ENOMEM;
From: Ingo Molnar <mingo@kernel.org>

mainline inclusion
from mainline-v5.6-rc1
commit 1f059dfdf5d170dccbac92193be2fee3c1763384
category: feature
bugzilla: NA
CVE: NA
---------------------------
In the x86 MM code we'd like to untangle various types of historic header dependency spaghetti, but for this we'd need to pass to the generic vmalloc code various vmalloc related defines that customarily come via the <asm/page.h> low level arch header.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/alpha/include/asm/vmalloc.h      | 4 ++++
 arch/arc/include/asm/vmalloc.h        | 4 ++++
 arch/arm/include/asm/vmalloc.h        | 4 ++++
 arch/arm64/include/asm/vmalloc.h      | 4 ++++
 arch/c6x/include/asm/vmalloc.h        | 4 ++++
 arch/csky/include/asm/vmalloc.h       | 4 ++++
 arch/h8300/include/asm/vmalloc.h      | 4 ++++
 arch/hexagon/include/asm/vmalloc.h    | 4 ++++
 arch/ia64/include/asm/vmalloc.h       | 4 ++++
 arch/m68k/include/asm/vmalloc.h       | 4 ++++
 arch/microblaze/include/asm/vmalloc.h | 4 ++++
 arch/mips/include/asm/vmalloc.h       | 4 ++++
 arch/nds32/include/asm/vmalloc.h      | 4 ++++
 arch/nios2/include/asm/vmalloc.h      | 4 ++++
 arch/openrisc/include/asm/vmalloc.h   | 4 ++++
 arch/parisc/include/asm/vmalloc.h     | 4 ++++
 arch/powerpc/include/asm/vmalloc.h    | 4 ++++
 arch/riscv/include/asm/vmalloc.h      | 4 ++++
 arch/s390/include/asm/vmalloc.h       | 4 ++++
 arch/sh/include/asm/vmalloc.h         | 4 ++++
 arch/sparc/include/asm/vmalloc.h      | 4 ++++
 arch/um/include/asm/vmalloc.h         | 4 ++++
 arch/unicore32/include/asm/vmalloc.h  | 4 ++++
 arch/x86/include/asm/vmalloc.h        | 4 ++++
 arch/xtensa/include/asm/vmalloc.h     | 4 ++++
 include/linux/vmalloc.h               | 2 ++
 26 files changed, 102 insertions(+)
 create mode 100644 arch/alpha/include/asm/vmalloc.h
 create mode 100644 arch/arc/include/asm/vmalloc.h
 create mode 100644 arch/arm/include/asm/vmalloc.h
 create mode 100644 arch/arm64/include/asm/vmalloc.h
 create mode 100644 arch/c6x/include/asm/vmalloc.h
 create mode 100644 arch/csky/include/asm/vmalloc.h
 create mode 100644 arch/h8300/include/asm/vmalloc.h
 create mode 100644 arch/hexagon/include/asm/vmalloc.h
 create mode 100644 arch/ia64/include/asm/vmalloc.h
 create mode 100644 arch/m68k/include/asm/vmalloc.h
 create mode 100644 arch/microblaze/include/asm/vmalloc.h
 create mode 100644 arch/mips/include/asm/vmalloc.h
 create mode 100644 arch/nds32/include/asm/vmalloc.h
 create mode 100644 arch/nios2/include/asm/vmalloc.h
 create mode 100644 arch/openrisc/include/asm/vmalloc.h
 create mode 100644 arch/parisc/include/asm/vmalloc.h
 create mode 100644 arch/powerpc/include/asm/vmalloc.h
 create mode 100644 arch/riscv/include/asm/vmalloc.h
 create mode 100644 arch/s390/include/asm/vmalloc.h
 create mode 100644 arch/sh/include/asm/vmalloc.h
 create mode 100644 arch/sparc/include/asm/vmalloc.h
 create mode 100644 arch/um/include/asm/vmalloc.h
 create mode 100644 arch/unicore32/include/asm/vmalloc.h
 create mode 100644 arch/x86/include/asm/vmalloc.h
 create mode 100644 arch/xtensa/include/asm/vmalloc.h
diff --git a/arch/alpha/include/asm/vmalloc.h b/arch/alpha/include/asm/vmalloc.h
new file mode 100644
index 000000000000..0a9a366a4d34
--- /dev/null
+++ b/arch/alpha/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_ALPHA_VMALLOC_H
+#define _ASM_ALPHA_VMALLOC_H
+
+#endif /* _ASM_ALPHA_VMALLOC_H */
diff --git a/arch/arc/include/asm/vmalloc.h b/arch/arc/include/asm/vmalloc.h
new file mode 100644
index 000000000000..973095aad665
--- /dev/null
+++ b/arch/arc/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_ARC_VMALLOC_H
+#define _ASM_ARC_VMALLOC_H
+
+#endif /* _ASM_ARC_VMALLOC_H */
diff --git a/arch/arm/include/asm/vmalloc.h b/arch/arm/include/asm/vmalloc.h
new file mode 100644
index 000000000000..a9b3718b8600
--- /dev/null
+++ b/arch/arm/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_ARM_VMALLOC_H
+#define _ASM_ARM_VMALLOC_H
+
+#endif /* _ASM_ARM_VMALLOC_H */
diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
new file mode 100644
index 000000000000..2ca708ab9b20
--- /dev/null
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_ARM64_VMALLOC_H
+#define _ASM_ARM64_VMALLOC_H
+
+#endif /* _ASM_ARM64_VMALLOC_H */
diff --git a/arch/c6x/include/asm/vmalloc.h b/arch/c6x/include/asm/vmalloc.h
new file mode 100644
index 000000000000..26c6c6696bbd
--- /dev/null
+++ b/arch/c6x/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_C6X_VMALLOC_H
+#define _ASM_C6X_VMALLOC_H
+
+#endif /* _ASM_C6X_VMALLOC_H */
diff --git a/arch/csky/include/asm/vmalloc.h b/arch/csky/include/asm/vmalloc.h
new file mode 100644
index 000000000000..43dca6336b4c
--- /dev/null
+++ b/arch/csky/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_CSKY_VMALLOC_H
+#define _ASM_CSKY_VMALLOC_H
+
+#endif /* _ASM_CSKY_VMALLOC_H */
diff --git a/arch/h8300/include/asm/vmalloc.h b/arch/h8300/include/asm/vmalloc.h
new file mode 100644
index 000000000000..08a55c1dfa23
--- /dev/null
+++ b/arch/h8300/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_H8300_VMALLOC_H
+#define _ASM_H8300_VMALLOC_H
+
+#endif /* _ASM_H8300_VMALLOC_H */
diff --git a/arch/hexagon/include/asm/vmalloc.h b/arch/hexagon/include/asm/vmalloc.h
new file mode 100644
index 000000000000..7b04609e525c
--- /dev/null
+++ b/arch/hexagon/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_HEXAGON_VMALLOC_H
+#define _ASM_HEXAGON_VMALLOC_H
+
+#endif /* _ASM_HEXAGON_VMALLOC_H */
diff --git a/arch/ia64/include/asm/vmalloc.h b/arch/ia64/include/asm/vmalloc.h
new file mode 100644
index 000000000000..a2b51141ad28
--- /dev/null
+++ b/arch/ia64/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_IA64_VMALLOC_H
+#define _ASM_IA64_VMALLOC_H
+
+#endif /* _ASM_IA64_VMALLOC_H */
diff --git a/arch/m68k/include/asm/vmalloc.h b/arch/m68k/include/asm/vmalloc.h
new file mode 100644
index 000000000000..bc1dca6cf134
--- /dev/null
+++ b/arch/m68k/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_M68K_VMALLOC_H
+#define _ASM_M68K_VMALLOC_H
+
+#endif /* _ASM_M68K_VMALLOC_H */
diff --git a/arch/microblaze/include/asm/vmalloc.h b/arch/microblaze/include/asm/vmalloc.h
new file mode 100644
index 000000000000..04013a42b0fe
--- /dev/null
+++ b/arch/microblaze/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_MICROBLAZE_VMALLOC_H
+#define _ASM_MICROBLAZE_VMALLOC_H
+
+#endif /* _ASM_MICROBLAZE_VMALLOC_H */
diff --git a/arch/mips/include/asm/vmalloc.h b/arch/mips/include/asm/vmalloc.h
new file mode 100644
index 000000000000..25dc09b25eaf
--- /dev/null
+++ b/arch/mips/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_MIPS_VMALLOC_H
+#define _ASM_MIPS_VMALLOC_H
+
+#endif /* _ASM_MIPS_VMALLOC_H */
diff --git a/arch/nds32/include/asm/vmalloc.h b/arch/nds32/include/asm/vmalloc.h
new file mode 100644
index 000000000000..caeed3898419
--- /dev/null
+++ b/arch/nds32/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_NDS32_VMALLOC_H
+#define _ASM_NDS32_VMALLOC_H
+
+#endif /* _ASM_NDS32_VMALLOC_H */
diff --git a/arch/nios2/include/asm/vmalloc.h b/arch/nios2/include/asm/vmalloc.h
new file mode 100644
index 000000000000..ec7a9260090b
--- /dev/null
+++ b/arch/nios2/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_NIOS2_VMALLOC_H
+#define _ASM_NIOS2_VMALLOC_H
+
+#endif /* _ASM_NIOS2_VMALLOC_H */
diff --git a/arch/openrisc/include/asm/vmalloc.h b/arch/openrisc/include/asm/vmalloc.h
new file mode 100644
index 000000000000..75435eceec32
--- /dev/null
+++ b/arch/openrisc/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_OPENRISC_VMALLOC_H
+#define _ASM_OPENRISC_VMALLOC_H
+
+#endif /* _ASM_OPENRISC_VMALLOC_H */
diff --git a/arch/parisc/include/asm/vmalloc.h b/arch/parisc/include/asm/vmalloc.h
new file mode 100644
index 000000000000..1088ae4e7af9
--- /dev/null
+++ b/arch/parisc/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_PARISC_VMALLOC_H
+#define _ASM_PARISC_VMALLOC_H
+
+#endif /* _ASM_PARISC_VMALLOC_H */
diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h
new file mode 100644
index 000000000000..b992dfaaa161
--- /dev/null
+++ b/arch/powerpc/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_POWERPC_VMALLOC_H
+#define _ASM_POWERPC_VMALLOC_H
+
+#endif /* _ASM_POWERPC_VMALLOC_H */
diff --git a/arch/riscv/include/asm/vmalloc.h b/arch/riscv/include/asm/vmalloc.h
new file mode 100644
index 000000000000..ff9abc00d139
--- /dev/null
+++ b/arch/riscv/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_RISCV_VMALLOC_H
+#define _ASM_RISCV_VMALLOC_H
+
+#endif /* _ASM_RISCV_VMALLOC_H */
diff --git a/arch/s390/include/asm/vmalloc.h b/arch/s390/include/asm/vmalloc.h
new file mode 100644
index 000000000000..3ba3a6bdca25
--- /dev/null
+++ b/arch/s390/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_S390_VMALLOC_H
+#define _ASM_S390_VMALLOC_H
+
+#endif /* _ASM_S390_VMALLOC_H */
diff --git a/arch/sh/include/asm/vmalloc.h b/arch/sh/include/asm/vmalloc.h
new file mode 100644
index 000000000000..716b77472646
--- /dev/null
+++ b/arch/sh/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_SH_VMALLOC_H
+#define _ASM_SH_VMALLOC_H
+
+#endif /* _ASM_SH_VMALLOC_H */
diff --git a/arch/sparc/include/asm/vmalloc.h b/arch/sparc/include/asm/vmalloc.h
new file mode 100644
index 000000000000..04b8ab9518b8
--- /dev/null
+++ b/arch/sparc/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_SPARC_VMALLOC_H
+#define _ASM_SPARC_VMALLOC_H
+
+#endif /* _ASM_SPARC_VMALLOC_H */
diff --git a/arch/um/include/asm/vmalloc.h b/arch/um/include/asm/vmalloc.h
new file mode 100644
index 000000000000..9a7b9ed93733
--- /dev/null
+++ b/arch/um/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_UM_VMALLOC_H
+#define _ASM_UM_VMALLOC_H
+
+#endif /* _ASM_UM_VMALLOC_H */
diff --git a/arch/unicore32/include/asm/vmalloc.h b/arch/unicore32/include/asm/vmalloc.h
new file mode 100644
index 000000000000..054435818a14
--- /dev/null
+++ b/arch/unicore32/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_UNICORE32_VMALLOC_H
+#define _ASM_UNICORE32_VMALLOC_H
+
+#endif /* _ASM_UNICORE32_VMALLOC_H */
diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
new file mode 100644
index 000000000000..ba6295fa5876
--- /dev/null
+++ b/arch/x86/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_X86_VMALLOC_H
+#define _ASM_X86_VMALLOC_H
+
+#endif /* _ASM_X86_VMALLOC_H */
diff --git a/arch/xtensa/include/asm/vmalloc.h b/arch/xtensa/include/asm/vmalloc.h
new file mode 100644
index 000000000000..0eb94b70be55
--- /dev/null
+++ b/arch/xtensa/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_XTENSA_VMALLOC_H
+#define _ASM_XTENSA_VMALLOC_H
+
+#endif /* _ASM_XTENSA_VMALLOC_H */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 239571c69d6b..ab60c69a4097 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -10,6 +10,8 @@
 #include <linux/rbtree.h>
 #include <linux/overflow.h>
 
+#include <asm/vmalloc.h>
+
 struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 struct notifier_block;		/* in notifier.h */
From: Daniel Axtens <dja@axtens.net>

mainline inclusion
from mainline-v5.5-rc3
commit be1db4753ee6a0db80a900df9dbbf6ad2acc4bd1
category: feature
bugzilla: NA
CVE: NA
---------------------------
apply_to_page_range() takes an address range, and if any parts of it are not covered by the existing page table hierarchy, it allocates memory to fill them in.
In some use cases, this is not what we want - we want to be able to operate exclusively on PTEs that are already in the tables.
Add apply_to_existing_page_range() for this. Adjust the walker functions for apply_to_page_range to take 'create', which switches them between the old and new modes.
This will be used in KASAN vmalloc.
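As a hedged usage sketch (clear_pte_cb() and remove_mappings() are hypothetical names, not part of this patch; pte_fn_t here keeps the pgtable_t token argument this tree uses):

/*
 * Hypothetical caller: the callback is only invoked for PTEs that
 * already exist in the page tables; nothing is allocated.
 */
static int clear_pte_cb(pte_t *pte, pgtable_t token, unsigned long addr,
			void *data)
{
	pte_clear(&init_mm, addr, pte);	/* operate on an existing PTE */
	return 0;
}

static void remove_mappings(unsigned long start, unsigned long size)
{
	/*
	 * Unlike apply_to_page_range(), this never calls pte_alloc_*()
	 * and simply skips holes in the page-table hierarchy.
	 */
	apply_to_existing_page_range(&init_mm, start, size,
				     clear_pte_cb, NULL);
}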
[akpm@linux-foundation.org: reduce code duplication]
[akpm@linux-foundation.org: s/apply_to_existing_pages/apply_to_existing_page_range/]
[akpm@linux-foundation.org: initialize __apply_to_page_range::err]
Link: http://lkml.kernel.org/r/20191205140407.1874-1-dja@axtens.net
Signed-off-by: Daniel Axtens <dja@axtens.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Daniel Axtens <dja@axtens.net>
Cc: Qian Cai <cai@lca.pw>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/mm.h |   3 +
 mm/memory.c        | 136 +++++++++++++++++++++++++++++++--------------
 2 files changed, 97 insertions(+), 42 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d26d870f68cc..0ff3de89f897 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2642,6 +2642,9 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
+extern int apply_to_existing_page_range(struct mm_struct *mm,
+				   unsigned long address, unsigned long size,
+				   pte_fn_t fn, void *data);
 
 #ifdef CONFIG_PAGE_POISONING
diff --git a/mm/memory.c b/mm/memory.c
index 2c5a66af85bb..20d94eed5de9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1956,18 +1956,24 @@ EXPORT_SYMBOL(vm_iomap_memory);
 
 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pte_t *pte;
-	int err;
+	int err = 0;
 	pgtable_t token;
 	spinlock_t *uninitialized_var(ptl);
 
-	pte = (mm == &init_mm) ?
-		pte_alloc_kernel(pmd, addr) :
-		pte_alloc_map_lock(mm, pmd, addr, &ptl);
-	if (!pte)
-		return -ENOMEM;
+	if (create) {
+		pte = (mm == &init_mm) ?
+			pte_alloc_kernel(pmd, addr) :
+			pte_alloc_map_lock(mm, pmd, addr, &ptl);
+		if (!pte)
+			return -ENOMEM;
+	} else {
+		pte = (mm == &init_mm) ?
+			pte_offset_kernel(pmd, addr) :
+			pte_offset_map_lock(mm, pmd, addr, &ptl);
+	}
 
 	BUG_ON(pmd_huge(*pmd));
 
@@ -1976,9 +1982,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = fn(pte++, token, addr, data);
-		if (err)
-			break;
+		if (create || !pte_none(*pte)) {
+			err = fn(pte++, token, addr, data);
+			if (err)
+				break;
+		}
 	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
@@ -1990,77 +1998,95 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	int err;
+	int err = 0;
 
 	BUG_ON(pud_huge(*pud));
 
-	pmd = pmd_alloc(mm, pud, addr);
-	if (!pmd)
-		return -ENOMEM;
+	if (create) {
+		pmd = pmd_alloc(mm, pud, addr);
+		if (!pmd)
+			return -ENOMEM;
+	} else {
+		pmd = pmd_offset(pud, addr);
+	}
 	do {
 		next = pmd_addr_end(addr, end);
-		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !pmd_none_or_clear_bad(pmd)) {
+			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (pmd++, addr = next, addr != end);
 	return err;
 }
 
 static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pud_t *pud;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	pud = pud_alloc(mm, p4d, addr);
-	if (!pud)
-		return -ENOMEM;
+	if (create) {
+		pud = pud_alloc(mm, p4d, addr);
+		if (!pud)
+			return -ENOMEM;
+	} else {
+		pud = pud_offset(p4d, addr);
+	}
 	do {
 		next = pud_addr_end(addr, end);
-		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !pud_none_or_clear_bad(pud)) {
+			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (pud++, addr = next, addr != end);
 	return err;
 }
 
 static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	p4d_t *p4d;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	p4d = p4d_alloc(mm, pgd, addr);
-	if (!p4d)
-		return -ENOMEM;
+	if (create) {
+		p4d = p4d_alloc(mm, pgd, addr);
+		if (!p4d)
+			return -ENOMEM;
+	} else {
+		p4d = p4d_offset(pgd, addr);
+	}
 	do {
 		next = p4d_addr_end(addr, end);
-		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !p4d_none_or_clear_bad(p4d)) {
+			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (p4d++, addr = next, addr != end);
 	return err;
 }
 
-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
- */
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
-			unsigned long size, pte_fn_t fn, void *data)
+static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+				 unsigned long size, pte_fn_t fn,
+				 void *data, bool create)
 {
 	pgd_t *pgd;
 	unsigned long next;
 	unsigned long end = addr + size;
-	int err;
+	int err = 0;
 
 	if (WARN_ON(addr >= end))
 		return -EINVAL;
@@ -2068,15 +2094,41 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
+		if (!create && pgd_none_or_clear_bad(pgd))
+			continue;
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
 
 	return err;
 }
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+			unsigned long size, pte_fn_t fn, void *data)
+{
+	return __apply_to_page_range(mm, addr, size, fn, data, true);
+}
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
+/*
+ * Scan a region of virtual memory, calling a provided function on
+ * each leaf page table where it exists.
+ *
+ * Unlike apply_to_page_range, this does _not_ fill in page tables
+ * where they are absent.
+ */
+int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
+				 unsigned long size, pte_fn_t fn, void *data)
+{
+	return __apply_to_page_range(mm, addr, size, fn, data, false);
+}
+EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
+
 /*
  * handle_pte_fault chooses page fault handler according to an entry which was
  * read non-atomically. Before making any commitment, on those architectures
From: Steven Price <steven.price@arm.com>

mainline inclusion
from mainline-v5.6-rc1
commit 93fab1b22ef7c4abcbc760ce4432762b02e7f3d1
category: feature
bugzilla: NA
CVE: NA
---------------------------
Patch series "Generic page walk and ptdump", v17.
Many architectures currently have a debugfs file for dumping the kernel page tables. Each architecture has to implement custom functions for this because the details of walking the page tables used by the kernel are different between architectures.
This series extends the capabilities of walk_page_range() so that it can deal with the page tables of the kernel (which have no VMAs and can contain larger huge pages than exist for user space). A generic PTDUMP implementation is then implemented making use of the new functionality of walk_page_range(), and finally arm64 and x86 are switched to using it, removing the custom table walkers.
To enable a generic page table walker to walk the unusual mappings of the kernel we need to implement a set of functions which let us know when the walker has reached the leaf entry. After a suggestion from Will Deacon I've chosen the name p?d_leaf() as this (hopefully) describes the purpose (and is a new name so has no historic baggage). Some architectures have p?d_large macros but this is easily confused with "large pages".
This series ends with a generic PTDUMP implementation for arm64 and x86.
Mostly this is a clean up and there should be very little functional change. The exceptions are:
* arm64 PTDUMP debugfs now displays pages which aren't present (patch 22).
* arm64 has the ability to efficiently process KASAN pages (which previously only x86 implemented). This means that the combination of KASAN and DEBUG_WX is now useable.
This patch (of 23):
Exposing the pud/pgd levels of the page tables to walk_page_range() means we may come across the exotic large mappings that come with large areas of contiguous memory (such as the kernel's linear map).
For architectures that don't provide all p?d_leaf() macros, provide generic do-nothing defaults that are suitable where there cannot be leaf pages at that level. Further patches will add implementations for individual architectures.
The name p?d_leaf() is chosen to minimize the confusion with existing uses of "large" pages and "huge" pages which do not necessarily mean that the entry is a leaf (for example it may be a set of contiguous entries that only take 1 TLB slot). For the purpose of walking the page tables we don't need to know how it will be represented in the TLB, but we do need to know for sure if it is a leaf of the tree.
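A hedged sketch of how a generic walker would use the macro (walk_pmd_level(), note_mapping() and walk_pte_level() are hypothetical helpers, not part of this series):

/*
 * At each level the walker must know whether the entry maps memory
 * directly (a leaf) or points to a lower-level table.
 */
static void walk_pmd_level(pud_t *pud, unsigned long addr, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long next;

	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
		if (pmd_leaf(*pmd)) {
			/* Final mapping: report it, do not descend. */
			note_mapping(addr, next);	/* hypothetical */
			continue;
		}
		/* Table entry: descend to the PTE level. */
		walk_pte_level(pmd, addr, next);	/* hypothetical */
	} while (pmd++, addr = next, addr != end);
}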
Link: http://lkml.kernel.org/r/20191218162402.45610-2-steven.price@arm.com
Signed-off-by: Steven Price <steven.price@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Liang, Kan" <kan.liang@linux.intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Zong Li <zong.li@sifive.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/asm-generic/pgtable.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a7912c229033..124d2e3f6406 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1188,4 +1188,24 @@ static inline bool arch_has_pfn_modify_check(void)
 #define mm_pmd_folded(mm)	__is_defined(__PAGETABLE_PMD_FOLDED)
 #endif
 
+/*
+ * p?d_leaf() - true if this entry is a final mapping to a physical address.
+ * This differs from p?d_huge() by the fact that they are always available (if
+ * the architecture supports large pages at the appropriate level) even
+ * if CONFIG_HUGETLB_PAGE is not defined.
+ * Only meaningful when called on a valid entry.
+ */
+#ifndef pgd_leaf
+#define pgd_leaf(x)	0
+#endif
+#ifndef p4d_leaf
+#define p4d_leaf(x)	0
+#endif
+#ifndef pud_leaf
+#define pud_leaf(x)	0
+#endif
+#ifndef pmd_leaf
+#define pmd_leaf(x)	0
+#endif
+
 #endif /* _ASM_GENERIC_PGTABLE_H */
From: Steven Price <steven.price@arm.com>

mainline inclusion
from mainline-v5.6-rc1
commit 8aa82df3c123129025a364d8f823929cc488b834
category: feature
bugzilla: NA
CVE: NA
---------------------------
walk_page_range() is going to be allowed to walk page tables other than those of user space. For this it needs to know when it has reached a 'leaf' entry in the page tables. This information will be provided by the p?d_leaf() functions/macros.
For arm64, we already have p?d_sect() macros which we can reuse for p?d_leaf().
pud_sect() is defined as a dummy function when CONFIG_PGTABLE_LEVELS < 3 or CONFIG_ARM64_64K_PAGES is defined. However, when the kernel is configured this way, the architecture does not allow a large page at this level, and any code using these page-walking macros implicitly relies on the page size/number of levels being the same as the kernel's. So it is safe to reuse pud_sect() for p?d_leaf(), as this is an architectural restriction.
Link: http://lkml.kernel.org/r/20191218162402.45610-5-steven.price@arm.com
Signed-off-by: Steven Price <steven.price@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: James Morse <james.morse@arm.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: "Liang, Kan" <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Zong Li <zong.li@sifive.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/include/asm/pgtable.h | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0662acc42d57..f045dce2e781 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -419,6 +419,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				 PMD_TYPE_TABLE)
 #define pmd_sect(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
 				 PMD_TYPE_SECT)
+#define pmd_leaf(pmd)		pmd_sect(pmd)
 
 #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
 static inline bool pud_sect(pud_t pud) { return false; }
@@ -484,6 +485,7 @@ static inline void pte_unmap(pte_t *pte) { }
 #define pud_none(pud)		(!pud_val(pud))
 #define pud_bad(pud)		(!(pud_val(pud) & PUD_TABLE_BIT))
 #define pud_present(pud)	pte_present(pud_pte(pud))
+#define pud_leaf(pud)		pud_sect(pud)
 #define pud_valid(pud)		pte_valid(pud_pte(pud))
 
 static inline void set_pud(pud_t *pudp, pud_t pud)
From: Christoph Hellwig <hch@lst.de>

mainline inclusion
from mainline-v5.8-rc1
commit 4926627793c0a7e7db2bc674e1d06777e86d8dab
category: feature
bugzilla: NA
CVE: NA
---------------------------
Switch the two remaining callers to use __get_vm_area_caller instead.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Kelley <mikelley@microsoft.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Link: http://lkml.kernel.org/r/20200414131348.444715-9-hch@lst.de
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/powerpc/kernel/pci_64.c | 3 ++-
 arch/sh/kernel/cpu/sh4/sq.c  | 3 ++-
 include/linux/vmalloc.h      | 2 --
 mm/vmalloc.c                 | 8 --------
 4 files changed, 4 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index dff28f903512..ac956dfdbbde 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -144,7 +144,8 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose)
 	 * with incomplete address decoding but I'd rather not deal with
 	 * those outside of the reserved 64K legacy region.
 	 */
-	area = __get_vm_area(size_page, 0, PHB_IO_BASE, PHB_IO_END);
+	area = __get_vm_area_caller(size_page, 0, PHB_IO_BASE, PHB_IO_END,
+				    __builtin_return_address(0));
 	if (area == NULL)
 		return -ENOMEM;
 	hose->io_base_alloc = area->addr;
diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c
index 4ca78ed71ad2..fb97dab3bad7 100644
--- a/arch/sh/kernel/cpu/sh4/sq.c
+++ b/arch/sh/kernel/cpu/sh4/sq.c
@@ -106,7 +106,8 @@ static int __sq_remap(struct sq_mapping *map, pgprot_t prot)
 #if defined(CONFIG_MMU)
 	struct vm_struct *vma;
 
-	vma = __get_vm_area(map->size, VM_ALLOC, map->sq_addr, SQ_ADDRMAX);
+	vma = __get_vm_area_caller(map->size, VM_ALLOC, map->sq_addr,
+				   SQ_ADDRMAX, __builtin_return_address(0));
 	if (!vma)
 		return -ENOMEM;
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index ab60c69a4097..b789a8fd5138 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -133,8 +133,6 @@ static inline size_t get_vm_area_size(const struct vm_struct *area)
 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
 extern struct vm_struct *get_vm_area_caller(unsigned long size,
 					unsigned long flags, const void *caller);
-extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
-					unsigned long start, unsigned long end);
 extern struct vm_struct *__get_vm_area_caller(unsigned long size,
 					unsigned long flags,
 					unsigned long start, unsigned long end,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d768f4064a22..4c6d7762e4a0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2000,14 +2000,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 	return area;
 }
 
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
-				unsigned long start, unsigned long end)
-{
-	return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
-				  GFP_KERNEL, __builtin_return_address(0));
-}
-EXPORT_SYMBOL_GPL(__get_vm_area);
-
 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
 				       unsigned long start, unsigned long end,
 				       const void *caller)
From: Christoph Hellwig <hch@lst.de>

mainline inclusion
from mainline-v5.8-rc1
commit 8f87cc9386dc7965de151605637eee939ea0d098
category: feature
bugzilla: NA
CVE: NA

Note: this commit will break kABI by unexporting an interface.
---------------------------
There are no modular users of this function.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Kelley <mikelley@microsoft.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Link: http://lkml.kernel.org/r/20200414131348.444715-10-hch@lst.de
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/vmalloc.c | 1 -
 1 file changed, 1 deletion(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4c6d7762e4a0..5f14a7d294cf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1910,7 +1910,6 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
 {
 	vunmap_page_range(addr, addr + size);
 }
-EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
 
 /**
  * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
From: Christoph Hellwig <hch@lst.de>

mainline inclusion
from mainline-v5.8-rc1
commit b607e6d17db5b91e6a807b4f9a2e849219d720a0
category: feature
bugzilla: NA
CVE: NA
---------------------------
This allows us to unexport map_vm_area and unmap_kernel_range, which are rather deep internal interfaces and should not be available to modules: for example, they allow fine-grained control of mapping permissions, and they allow splitting the setup of a vmalloc area from the actual mapping, and thus expose vmalloc internals.
zsmalloc is typically built-in and continues to work (just like the percpu-vm code, which uses a similar pattern), while modular zsmalloc also continues to work, but must fall back to copy-based mapping.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Kelley <mikelley@microsoft.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Link: http://lkml.kernel.org/r/20200414131348.444715-12-hch@lst.de
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/Kconfig   | 2 +-
 mm/vmalloc.c | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 7ebd52dc1e40..dddeb30d645e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -617,7 +617,7 @@ config ZSMALLOC
 
 config PGTABLE_MAPPING
 	bool "Use page table mapping to access object in zsmalloc"
-	depends on ZSMALLOC
+	depends on ZSMALLOC=y
 	help
 	  By default, zsmalloc uses a copy-based object mapping method to
 	  access allocations that span two pages. However, if a particular
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5f14a7d294cf..66b0a7d03b28 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1927,7 +1927,6 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
 	vunmap_page_range(addr, end);
 	flush_tlb_kernel_range(addr, end);
 }
-EXPORT_SYMBOL_GPL(unmap_kernel_range);
 
 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
 {
@@ -1939,7 +1938,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
 
 	return err > 0 ? 0 : err;
 }
-EXPORT_SYMBOL_GPL(map_vm_area);
 
 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 			     unsigned long flags, const void *caller)
From: Christoph Hellwig <hch@lst.de>

mainline inclusion
from mainline-v5.8-rc1
commit 78a0e8c4837f42e9c2b1127e9c450ceeb0efbde6
category: feature
bugzilla: NA
CVE: NA
---------------------------
Every use of addr in vb_free() first casts it to unsigned long, and the caller has an unsigned long version of the address available anyway. Just pass that and avoid all the casts.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Kelley <mikelley@microsoft.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Link: http://lkml.kernel.org/r/20200414131348.444715-13-hch@lst.de
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/vmalloc.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 66b0a7d03b28..e7d95dee4458 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1556,7 +1556,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 	return vaddr;
 }
 
-static void vb_free(const void *addr, unsigned long size)
+static void vb_free(unsigned long addr, unsigned long size)
 {
 	unsigned long offset;
 	unsigned long vb_idx;
@@ -1566,24 +1566,22 @@ static void vb_free(const void *addr, unsigned long size)
 	BUG_ON(offset_in_page(size));
 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
 
-	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
+	flush_cache_vunmap(addr, addr + size);
 
 	order = get_order(size);
 
-	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
-	offset >>= PAGE_SHIFT;
+	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
 
-	vb_idx = addr_to_vb_idx((unsigned long)addr);
+	vb_idx = addr_to_vb_idx(addr);
 	rcu_read_lock();
 	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
 	rcu_read_unlock();
 	BUG_ON(!vb);
 
-	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+	vunmap_page_range(addr, addr + size);
 
 	if (debug_pagealloc_enabled())
-		flush_tlb_kernel_range((unsigned long)addr,
-					(unsigned long)addr + size);
+		flush_tlb_kernel_range(addr, addr + size);
 
 	spin_lock(&vb->lock);
 
@@ -1675,7 +1673,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
 
 	if (likely(count <= VMAP_MAX_ALLOC)) {
 		debug_check_no_locks_freed(mem, size);
-		vb_free(mem, size);
+		vb_free(addr, size);
 		return;
 	}
From: Christoph Hellwig hch@lst.de
mainline inclusion from mainline-v5.8-rc1 commit b521c43f58e5234ee9b29817ed5e93523abcffa9 category: feature bugzilla: NA CVE: NA
---------------------------
These have non-static aliases called map_kernel_range_noflush and unmap_kernel_range_noflush that differ only slightly in calling convention: they take addr + size instead of an end address.
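A sketch of the two conventions, using the names from this patch; the static helper took an end address, while the exported alias takes a size:

	vunmap_page_range(addr, addr + size);      /* static helper: start/end  */
	unmap_kernel_range_noflush(addr, size);    /* exported name: start/size */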
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Andrew Morton akpm@linux-foundation.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Cc: Christian Borntraeger borntraeger@de.ibm.com Cc: Christophe Leroy christophe.leroy@c-s.fr Cc: Daniel Vetter daniel.vetter@ffwll.ch Cc: David Airlie airlied@linux.ie Cc: Gao Xiang xiang@kernel.org Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Haiyang Zhang haiyangz@microsoft.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: "K. Y. Srinivasan" kys@microsoft.com Cc: Laura Abbott labbott@redhat.com Cc: Mark Rutland mark.rutland@arm.com Cc: Michael Kelley mikelley@microsoft.com Cc: Minchan Kim minchan@kernel.org Cc: Nitin Gupta ngupta@vflare.org Cc: Robin Murphy robin.murphy@arm.com Cc: Sakari Ailus sakari.ailus@linux.intel.com Cc: Stephen Hemminger sthemmin@microsoft.com Cc: Sumit Semwal sumit.semwal@linaro.org Cc: Wei Liu wei.liu@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Catalin Marinas catalin.marinas@arm.com Cc: Heiko Carstens heiko.carstens@de.ibm.com Cc: Paul Mackerras paulus@ozlabs.org Cc: Vasily Gorbik gor@linux.ibm.com Cc: Will Deacon will@kernel.org Link: http://lkml.kernel.org/r/20200414131348.444715-14-hch@lst.de Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmalloc.c | 98 +++++++++++++++++++++------------------------------- 1 file changed, 40 insertions(+), 58 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e7d95dee4458..589bb3bcd315 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -118,10 +118,24 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) } while (p4d++, addr = next, addr != end); }
-static void vunmap_page_range(unsigned long addr, unsigned long end) +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify + * should have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible + * for calling flush_cache_vunmap() on to-be-mapped areas before calling this + * function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { - pgd_t *pgd; + unsigned long end = addr + size; unsigned long next; + pgd_t *pgd;
BUG_ON(addr >= end); pgd = pgd_offset_k(addr); @@ -210,18 +224,30 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, return 0; }
-/* - * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and - * will have pfns corresponding to the "pages" array. +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map * - * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should + * have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible for + * calling flush_cache_vmap() on to-be-mapped areas before calling this + * function. + * + * RETURNS: + * The number of pages mapped on success, -errno on failure. */ -static int vmap_page_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) { - pgd_t *pgd; + unsigned long end = addr + size; unsigned long next; - unsigned long addr = start; + pgd_t *pgd; int err = 0; int nr = 0;
@@ -242,7 +268,7 @@ static int vmap_page_range(unsigned long start, unsigned long end, { int ret;
- ret = vmap_page_range_noflush(start, end, prot, pages); + ret = map_kernel_range_noflush(start, end - start, prot, pages); flush_cache_vmap(start, end); return ret; } @@ -1147,7 +1173,7 @@ static void free_vmap_area(struct vmap_area *va) */ static void unmap_vmap_area(struct vmap_area *va) { - vunmap_page_range(va->va_start, va->va_end); + unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); }
/* @@ -1578,7 +1604,7 @@ static void vb_free(unsigned long addr, unsigned long size) rcu_read_unlock(); BUG_ON(!vb);
- vunmap_page_range(addr, addr + size); + unmap_kernel_range_noflush(addr, size);
if (debug_pagealloc_enabled()) flush_tlb_kernel_range(addr, addr + size); @@ -1865,50 +1891,6 @@ void __init vmalloc_init(void) vmap_initialized = true; }
-/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vmap() on to-be-mapped areas - * before calling this function. - * - * RETURNS: - * The number of pages mapped on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return vmap_page_range_noflush(addr, addr + size, prot, pages); -} - -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vunmap() on to-be-mapped areas - * before calling this function and flush_tlb_kernel_range() after. - */ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ - vunmap_page_range(addr, addr + size); -} - /** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB * @addr: start of the VM area to unmap @@ -1922,7 +1904,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) unsigned long end = addr + size;
flush_cache_vunmap(addr, end); - vunmap_page_range(addr, end); + unmap_kernel_range_noflush(addr, size); flush_tlb_kernel_range(addr, end); }
From: Christoph Hellwig hch@lst.de
mainline inclusion from mainline-v5.8-rc1 commit a29adb6209cead1f6c34a8d72481fb183bfc2d68 category: feature bugzilla: NA CVE: NA
---------------------------
This matches the map_kernel_range_noflush API. Also change it to pass a size instead of an end, similar to the noflush version.
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Andrew Morton akpm@linux-foundation.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Cc: Christian Borntraeger borntraeger@de.ibm.com Cc: Christophe Leroy christophe.leroy@c-s.fr Cc: Daniel Vetter daniel.vetter@ffwll.ch Cc: David Airlie airlied@linux.ie Cc: Gao Xiang xiang@kernel.org Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Haiyang Zhang haiyangz@microsoft.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: "K. Y. Srinivasan" kys@microsoft.com Cc: Laura Abbott labbott@redhat.com Cc: Mark Rutland mark.rutland@arm.com Cc: Michael Kelley mikelley@microsoft.com Cc: Minchan Kim minchan@kernel.org Cc: Nitin Gupta ngupta@vflare.org Cc: Robin Murphy robin.murphy@arm.com Cc: Sakari Ailus sakari.ailus@linux.intel.com Cc: Stephen Hemminger sthemmin@microsoft.com Cc: Sumit Semwal sumit.semwal@linaro.org Cc: Wei Liu wei.liu@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Catalin Marinas catalin.marinas@arm.com Cc: Heiko Carstens heiko.carstens@de.ibm.com Cc: Paul Mackerras paulus@ozlabs.org Cc: Vasily Gorbik gor@linux.ibm.com Cc: Will Deacon will@kernel.org Link: http://lkml.kernel.org/r/20200414131348.444715-15-hch@lst.de Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmalloc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 589bb3bcd315..a7edde4d6d23 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -263,13 +263,13 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return nr; }
-static int vmap_page_range(unsigned long start, unsigned long end, +static int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages) { int ret;
- ret = map_kernel_range_noflush(start, end - start, prot, pages); - flush_cache_vmap(start, end); + ret = map_kernel_range_noflush(start, size, prot, pages); + flush_cache_vmap(start, start + size); return ret; }
@@ -1747,7 +1747,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro addr = va->va_start; mem = (void *)addr; } - if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + if (map_kernel_range(addr, size, prot, pages) < 0) { vm_unmap_ram(mem, count); return NULL; } @@ -1911,10 +1911,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) { unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + get_vm_area_size(area); int err;
- err = vmap_page_range(addr, end, prot, pages); + err = map_kernel_range(addr, get_vm_area_size(area), prot, pages);
return err > 0 ? 0 : err; }
From: Christoph Hellwig hch@lst.de
mainline inclusion from mainline-v5.8-rc1 commit 60bb44652a0dcc44acfc2ed8ebb35e4a389e5421 category: feature bugzilla: NA CVE: NA
---------------------------
None of the callers needs the number of pages, and a 0 / -errno return value is a lot more intuitive.
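A sketch of what this buys the callers, based on the hunks below; the err > 0 ? 0 : err adjustment in map_vm_area() becomes unnecessary:

	int err = map_kernel_range_noflush(addr, size, prot, pages);
	if (err)	/* now simply 0 on success, -errno on failure */
		return err;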
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Andrew Morton akpm@linux-foundation.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Cc: Christian Borntraeger borntraeger@de.ibm.com Cc: Christophe Leroy christophe.leroy@c-s.fr Cc: Daniel Vetter daniel.vetter@ffwll.ch Cc: David Airlie airlied@linux.ie Cc: Gao Xiang xiang@kernel.org Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Haiyang Zhang haiyangz@microsoft.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: "K. Y. Srinivasan" kys@microsoft.com Cc: Laura Abbott labbott@redhat.com Cc: Mark Rutland mark.rutland@arm.com Cc: Michael Kelley mikelley@microsoft.com Cc: Minchan Kim minchan@kernel.org Cc: Nitin Gupta ngupta@vflare.org Cc: Robin Murphy robin.murphy@arm.com Cc: Sakari Ailus sakari.ailus@linux.intel.com Cc: Stephen Hemminger sthemmin@microsoft.com Cc: Sumit Semwal sumit.semwal@linaro.org Cc: Wei Liu wei.liu@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Catalin Marinas catalin.marinas@arm.com Cc: Heiko Carstens heiko.carstens@de.ibm.com Cc: Paul Mackerras paulus@ozlabs.org Cc: Vasily Gorbik gor@linux.ibm.com Cc: Will Deacon will@kernel.org Link: http://lkml.kernel.org/r/20200414131348.444715-16-hch@lst.de Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a7edde4d6d23..ada7d291fdca 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -240,7 +240,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, * function. * * RETURNS: - * The number of pages mapped on success, -errno on failure. + * 0 on success, -errno on failure. */ int map_kernel_range_noflush(unsigned long addr, unsigned long size, pgprot_t prot, struct page **pages) @@ -260,7 +260,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return err; } while (pgd++, addr = next, addr != end);
- return nr; + return 0; }
static int map_kernel_range(unsigned long start, unsigned long size,
From: Christoph Hellwig hch@lst.de
mainline inclusion from mainline-v5.8-rc1 commit ed1f324c5fed06c91f30a36aedb66f34244ab86e category: feature bugzilla: NA CVE: NA
Add an adjustment for map_vm_area in dma mapping, and don't remove it in ceph_common.
---------------------------
Switch all callers to map_kernel_range, which is symmetric to the unmap side (as well as to the _noflush versions).
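Each call site follows the same conversion pattern; a sketch condensed from the __vmalloc_area_node() hunk below:

	/* before */
	if (map_vm_area(area, prot, pages))
		goto fail;

	/* after: start address and size are passed explicitly */
	if (map_kernel_range((unsigned long)area->addr,
			     get_vm_area_size(area), prot, pages) < 0)
		goto fail;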
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Andrew Morton akpm@linux-foundation.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Cc: Christian Borntraeger borntraeger@de.ibm.com Cc: Christophe Leroy christophe.leroy@c-s.fr Cc: Daniel Vetter daniel.vetter@ffwll.ch Cc: David Airlie airlied@linux.ie Cc: Gao Xiang xiang@kernel.org Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Haiyang Zhang haiyangz@microsoft.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: "K. Y. Srinivasan" kys@microsoft.com Cc: Laura Abbott labbott@redhat.com Cc: Mark Rutland mark.rutland@arm.com Cc: Michael Kelley mikelley@microsoft.com Cc: Minchan Kim minchan@kernel.org Cc: Nitin Gupta ngupta@vflare.org Cc: Robin Murphy robin.murphy@arm.com Cc: Sakari Ailus sakari.ailus@linux.intel.com Cc: Stephen Hemminger sthemmin@microsoft.com Cc: Sumit Semwal sumit.semwal@linaro.org Cc: Wei Liu wei.liu@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Catalin Marinas catalin.marinas@arm.com Cc: Heiko Carstens heiko.carstens@de.ibm.com Cc: Paul Mackerras paulus@ozlabs.org Cc: Vasily Gorbik gor@linux.ibm.com Cc: Will Deacon will@kernel.org Link: http://lkml.kernel.org/r/20200414131348.444715-17-hch@lst.de Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/core-api/cachetlb.rst | 2 +- include/linux/vmalloc.h | 10 ++++------ kernel/dma/mapping.c | 3 ++- mm/vmalloc.c | 21 +++++++-------------- mm/zsmalloc.c | 4 +++- 5 files changed, 17 insertions(+), 23 deletions(-)
diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 6eb9d3f090cd..a2fed63e6929 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -223,7 +223,7 @@ Here are the routines, one by one: there will be no entries in the cache for the kernel address space for virtual addresses in the range 'start' to 'end-1'.
- The first of these two routines is invoked after map_vm_area() + The first of these two routines is invoked after map_kernel_range() has installed the page table entries. The second is invoked before unmap_kernel_range() deletes the page table entries.
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b789a8fd5138..086013413a1c 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -140,11 +140,11 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr);
-extern int map_vm_area(struct vm_struct *area, pgprot_t prot, - struct page **pages); #ifdef CONFIG_MMU extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); #else @@ -154,14 +154,12 @@ map_kernel_range_noflush(unsigned long start, unsigned long size, { return size >> PAGE_SHIFT; } +#define map_kernel_range map_kernel_range_noflush static inline void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { } -static inline void -unmap_kernel_range(unsigned long addr, unsigned long size) -{ -} +#define unmap_kernel_range unmap_kernel_range_noflush #endif
/* Allocate/destroy a 'vmalloc' VM area. */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index d2a92ddaac4d..9cc1b34cab8f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -255,7 +255,8 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, if (!area) return NULL;
- if (map_vm_area(area, prot, pages)) { + if (map_kernel_range((unsigned long)area->addr, size, prot, + pages) < 0) { vunmap(area->addr); return NULL; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ada7d291fdca..7f9ed0be261c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -263,8 +263,8 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return 0; }
-static int map_kernel_range(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages) +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages) { int ret;
@@ -1908,16 +1908,6 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) flush_tlb_kernel_range(addr, end); }
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) -{ - unsigned long addr = (unsigned long)area->addr; - int err; - - err = map_kernel_range(addr, get_vm_area_size(area), prot, pages); - - return err > 0 ? 0 : err; -} - static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2202,7 +2192,8 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL;
- if (map_vm_area(area, prot, pages)) { + if (map_kernel_range((unsigned long)area->addr, size, prot, + pages) < 0) { vunmap(area->addr); return NULL; } @@ -2263,8 +2254,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, cond_resched(); }
- if (map_vm_area(area, prot, pages)) + if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), + prot, pages) < 0) goto fail; + return area->addr;
fail: diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 85cc29c93d93..45a53ffd7c97 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1156,7 +1156,9 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); + unsigned long addr = (unsigned long)area->vm->addr; + + BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); area->vm_addr = area->vm->addr; return area->vm_addr + off; }
From: Anshuman Khandual anshuman.khandual@arm.com
mainline inclusion from mainline-v5.3-rc1 commit 0f472d04f59ff89d15b2a1c4eafde7317ddd67a2 category: feature bugzilla: NA CVE: NA
Drop support for the powerpc platform.
---------------------------
Finish up what commit c2febafc6773 ("mm: convert generic code to 5-level paging") started by levelling up P4D huge mapping support on par with PUD and PMD. A new arch callback arch_ioremap_p4d_supported() is added which just maintains the status quo (P4D huge map not supported) on x86, arm64 and powerpc.
When HAVE_ARCH_HUGE_VMAP is enabled it's just a simple support check from the arch, so the runtime effects are minimal.
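The new callback mirrors the existing pud/pmd hooks; a sketch of the status-quo stub and its use at boot, condensed from the hunks below:

	int __init arch_ioremap_p4d_supported(void)
	{
		return 0;	/* P4D huge mappings not supported on this arch */
	}

	/* ioremap_huge_init() records the capability once at boot: */
	if (arch_ioremap_p4d_supported())
		ioremap_p4d_capable = 1;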
Link: http://lkml.kernel.org/r/1561699231-20991-1-git-send-email-anshuman.khandual... Signed-off-by: Anshuman Khandual anshuman.khandual@arm.com Acked-by: Thomas Gleixner tglx@linutronix.de Acked-by: Michael Ellerman mpe@ellerman.id.au (powerpc) Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will.deacon@arm.com Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Andy Lutomirski luto@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Ingo Molnar mingo@redhat.com Cc: Kirill A. Shutemov kirill.shutemov@linux.intel.com Cc: Michal Hocko mhocko@kernel.org Cc: Stephen Rothwell sfr@canb.auug.org.au Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/mm/mmu.c | 5 +++++ arch/x86/mm/ioremap.c | 5 +++++ include/linux/io.h | 1 + lib/ioremap.c | 2 ++ 4 files changed, 13 insertions(+)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 99a86326a301..437ee4ab41a1 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -921,6 +921,11 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return dt_virt; }
+int __init arch_ioremap_p4d_supported(void) +{ + return 0; +} + int __init arch_ioremap_pud_supported(void) { /* diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index adc77904fc3e..6ce8a2b593e0 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -431,6 +431,11 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap);
+int __init arch_ioremap_p4d_supported(void) +{ + return 0; +} + int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 diff --git a/include/linux/io.h b/include/linux/io.h index da39ff89df65..6cbd11b951ec 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -45,6 +45,7 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP void __init ioremap_huge_init(void); +int arch_ioremap_p4d_supported(void); int arch_ioremap_pud_supported(void); int arch_ioremap_pmd_supported(void); #else diff --git a/lib/ioremap.c b/lib/ioremap.c index 063213685563..c3dc213b6980 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -30,6 +30,8 @@ early_param("nohugeiomap", set_nohugeiomap); void __init ioremap_huge_init(void) { if (!ioremap_huge_disabled) { + if (arch_ioremap_p4d_supported()) + ioremap_p4d_capable = 1; if (arch_ioremap_pud_supported()) ioremap_pud_capable = 1; if (arch_ioremap_pmd_supported())
From: Mike Rapoport rppt@linux.ibm.com
mainline inclusion from mainline-v5.9-rc1 commit ab05eabfa18a6a537a73408642177e4a803317dc category: feature bugzilla: NA CVE: NA
---------------------------
The functionality in lib/ioremap.c deals with pagetables, vmalloc and caches, so it naturally belongs to mm/. Moving it there will also allow declaring the p?d_alloc_track functions in a header file inside mm/ rather than having those declarations in include/linux/mm.h
Suggested-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Mike Rapoport rppt@linux.ibm.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Pekka Enberg penberg@kernel.org Cc: Abdul Haleem abdhalee@linux.vnet.ibm.com Cc: Andy Lutomirski luto@kernel.org Cc: Arnd Bergmann arnd@arndb.de Cc: Christophe Leroy christophe.leroy@csgroup.eu Cc: Joerg Roedel joro@8bytes.org Cc: Joerg Roedel jroedel@suse.de Cc: Max Filippov jcmvbkbc@gmail.com Cc: Peter Zijlstra (Intel) peterz@infradead.org Cc: Satheesh Rajendran sathnaga@linux.vnet.ibm.com Cc: Stafford Horne shorne@gmail.com Cc: Stephen Rothwell sfr@canb.auug.org.au Cc: Steven Rostedt rostedt@goodmis.org Cc: Geert Uytterhoeven geert@linux-m68k.org Cc: Matthew Wilcox willy@infradead.org Link: http://lkml.kernel.org/r/20200627143453.31835-8-rppt@kernel.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- lib/Makefile | 1 - mm/Makefile | 2 +- {lib => mm}/ioremap.c | 0 3 files changed, 1 insertion(+), 2 deletions(-) rename {lib => mm}/ioremap.c (100%)
diff --git a/lib/Makefile b/lib/Makefile index 1d7a705d7207..f92a84934ab2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -38,7 +38,6 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ nmi_backtrace.o nodemask.o win_minmax.o
lib-$(CONFIG_PRINTK) += dump_stack.o -lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o
lib-y += kobject.o klist.o diff --git a/mm/Makefile b/mm/Makefile index cb28266b0677..d71892c0bbf1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -25,7 +25,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ - pgtable-generic.o rmap.o vmalloc.o + pgtable-generic.o rmap.o vmalloc.o ioremap.o
ifdef CONFIG_CROSS_MEMORY_ATTACH diff --git a/lib/ioremap.c b/mm/ioremap.c similarity index 100% rename from lib/ioremap.c rename to mm/ioremap.c
From: Nicholas Piggin npiggin@gmail.com
ascend inclusion category: feature bugzilla: NA CVE: NA
https://lwn.net/ml/linux-kernel/20200825145753.529284-2-npiggin@gmail.com/
--------------
vmalloc_to_page returns NULL for addresses mapped by larger pages[*]. Whether or not a vmap is huge depends on the architecture details, alignments, boot options, etc., which the caller cannot be expected to know. Therefore HUGE_VMAP is a regression for vmalloc_to_page.
This change teaches vmalloc_to_page about larger pages, and returns the struct page that corresponds to the offset within the large page. This makes the API agnostic to mapping implementation details.
[*] As explained by commit 029c54b095995 ("mm/vmalloc.c: huge-vmap: fail gracefully on unexpected huge vmap mappings")
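The key arithmetic is the same at every level: take the leaf entry's head struct page and add the page offset within the huge mapping. A sketch of the PMD case from the hunk below; addr & ~PMD_MASK is the byte offset into the PMD-sized region, and shifting right by PAGE_SHIFT turns it into a page index:

	if (pmd_leaf(*pmd))
		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);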
Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmalloc.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7f9ed0be261c..bc8c03a18763 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> +#include <asm/pgtable.h>
#include "internal.h"
@@ -289,7 +290,9 @@ int is_vmalloc_or_module_addr(const void *x) }
/* - * Walk a vmap address to the struct page it maps. + * Walk a vmap address to the struct page it maps. Huge vmap mappings will + * return the tail page that corresponds to the base page address, which + * matches small vmap mappings. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { @@ -309,25 +312,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pgd_none(*pgd)) return NULL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return NULL; /* XXX: no allowance for huge pgd */ + if (WARN_ON_ONCE(pgd_bad(*pgd))) + return NULL; + p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; - pud = pud_offset(p4d, addr); + if (p4d_leaf(*p4d)) + return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(p4d_bad(*p4d))) + return NULL;
- /* - * Don't dereference bad PUD or PMD (below) entries. This will also - * identify huge mappings, which we may encounter on architectures - * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be - * identified as vmalloc addresses by is_vmalloc_addr(), but are - * not [unambiguously] associated with a struct page, so there is - * no correct value to return for them. - */ - WARN_ON_ONCE(pud_bad(*pud)); - if (pud_none(*pud) || pud_bad(*pud)) + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return NULL; + if (pud_leaf(*pud)) + return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pud_bad(*pud))) return NULL; + pmd = pmd_offset(pud, addr); - WARN_ON_ONCE(pmd_bad(*pmd)); - if (pmd_none(*pmd) || pmd_bad(*pmd)) + if (pmd_none(*pmd)) + return NULL; + if (pmd_leaf(*pmd)) + return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL;
ptep = pte_offset_map(pmd, addr); @@ -335,6 +346,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (pte_present(pte)) page = pte_page(pte); pte_unmap(ptep); + return page; } EXPORT_SYMBOL(vmalloc_to_page);
From: Nicholas Piggin npiggin@gmail.com
ascend inclusion category: feature bugzilla: NA CVE: NA
https://lwn.net/ml/linux-kernel/20200825145753.529284-3-npiggin@gmail.com/
--------------
apply_to_pte_range might mistake a large pte for bad, or treat it as a page table, resulting in a crash or corruption. Add a test to warn and return an error if large entries are found.
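Each level gains the same guard sequence; a sketch of the PMD case, condensed from the loop body in the hunk below:

	if (pmd_none(*pmd) && !create)
		continue;			/* nothing here, not populating */
	if (WARN_ON_ONCE(pmd_leaf(*pmd)))
		return -EINVAL;			/* large entry: warn and fail */
	if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
		if (!create)
			continue;
		pmd_clear_bad(pmd);
	}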
Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memory.c | 60 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 16 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c index 20d94eed5de9..704d923985dd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2015,13 +2015,20 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, } do { next = pmd_addr_end(addr, end); - if (create || !pmd_none_or_clear_bad(pmd)) { - err = apply_to_pte_range(mm, pmd, addr, next, fn, data, - create); - if (err) - break; + if (pmd_none(*pmd) && !create) + continue; + if (WARN_ON_ONCE(pmd_leaf(*pmd))) + return -EINVAL; + if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) { + if (!create) + continue; + pmd_clear_bad(pmd); } + err = apply_to_pte_range(mm, pmd, addr, next, fn, data, create); + if (err) + break; } while (pmd++, addr = next, addr != end); + return err; }
@@ -2042,13 +2049,20 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, } do { next = pud_addr_end(addr, end); - if (create || !pud_none_or_clear_bad(pud)) { - err = apply_to_pmd_range(mm, pud, addr, next, fn, data, - create); - if (err) - break; + if (pud_none(*pud) && !create) + continue; + if (WARN_ON_ONCE(pud_leaf(*pud))) + return -EINVAL; + if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) { + if (!create) + continue; + pud_clear_bad(pud); } + err = apply_to_pmd_range(mm, pud, addr, next, fn, data, create); + if (err) + break; } while (pud++, addr = next, addr != end); + return err; }
@@ -2069,13 +2083,20 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, } do { next = p4d_addr_end(addr, end); - if (create || !p4d_none_or_clear_bad(p4d)) { - err = apply_to_pud_range(mm, p4d, addr, next, fn, data, - create); - if (err) - break; + if (p4d_none(*p4d) && !create) + continue; + if (WARN_ON_ONCE(p4d_leaf(*p4d))) + return -EINVAL; + if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) { + if (!create) + continue; + p4d_clear_bad(p4d); } + err = apply_to_pud_range(mm, p4d, addr, next, fn, data, create); + if (err) + break; } while (p4d++, addr = next, addr != end); + return err; }
@@ -2094,8 +2115,15 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); - if (!create && pgd_none_or_clear_bad(pgd)) + if (pgd_none(*pgd) && !create) continue; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return -EINVAL; + if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { + if (!create) + continue; + pgd_clear_bad(pgd); + } err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create); if (err) break;
From: Nicholas Piggin npiggin@gmail.com
ascend inclusion category: feature bugzilla: NA CVE: NA
https://lwn.net/ml/linux-kernel/20200825145753.529284-4-npiggin@gmail.com/
--------------
The vmalloc mapper operates on a struct page * array rather than a linear physical address; rename it to make this distinction clear.
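Once the following patch also renames the ioremap-side helpers, the two walker families coexist and are distinguishable at a glance; a sketch of the two PTE-level signatures as they appear in this series:

	/* maps a struct page * array, one PTE per page */
	static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, pgprot_t prot, struct page **pages, int *nr);

	/* maps a linear physical address range (the ioremap-style walker) */
	static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, phys_addr_t phys_addr, pgprot_t prot);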
Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmalloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bc8c03a18763..c0dd884d65cd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -148,7 +148,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) } while (pgd++, addr = next, addr != end); }
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr, +static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pte_t *pte; @@ -174,7 +174,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, return 0; }
-static int vmap_pmd_range(pud_t *pud, unsigned long addr, +static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pmd_t *pmd; @@ -185,13 +185,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) + if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; }
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr, +static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pud_t *pud; @@ -202,13 +202,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) + if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; }
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, +static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr) { p4d_t *p4d; @@ -219,7 +219,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) + if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; @@ -256,7 +256,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); + err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr); if (err) return err; } while (pgd++, addr = next, addr != end);
From: Nicholas Piggin npiggin@gmail.com
ascend inclusion category: feature bugzilla: NA CVE: NA
https://lwn.net/ml/linux-kernel/20200825145753.529284-5-npiggin@gmail.com/
--------------
This will be used as a generic kernel virtual mapping function, so rename it in preparation.
Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/ioremap.c | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-)
diff --git a/mm/ioremap.c b/mm/ioremap.c index c3dc213b6980..0297d2e84032 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -60,7 +60,7 @@ static inline int ioremap_pud_enabled(void) { return 0; } static inline int ioremap_pmd_enabled(void) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pte_t *pte; @@ -78,9 +78,8 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, return 0; }
-static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) +static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { if (!ioremap_pmd_enabled()) return 0; @@ -97,7 +96,7 @@ static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, return pmd_set_huge(pmd, phys_addr, prot); }
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, +static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pmd_t *pmd; @@ -109,18 +108,17 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end);
- if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) continue;
- if (ioremap_pte_range(pmd, addr, next, phys_addr, prot)) + if (vmap_pte_range(pmd, addr, next, phys_addr, prot)) return -ENOMEM; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; }
-static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) +static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { if (!ioremap_pud_enabled()) return 0; @@ -137,8 +135,8 @@ static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, return pud_set_huge(pud, phys_addr, prot); }
-static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) +static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { pud_t *pud; unsigned long next; @@ -149,18 +147,17 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, do { next = pud_addr_end(addr, end);
- if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) + if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot)) continue;
- if (ioremap_pmd_range(pud, addr, next, phys_addr, prot)) + if (vmap_pmd_range(pud, addr, next, phys_addr, prot)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; }
-static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) +static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { if (!ioremap_p4d_enabled()) return 0; @@ -177,8 +174,8 @@ static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, return p4d_set_huge(p4d, phys_addr, prot); }
-static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) +static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { p4d_t *p4d; unsigned long next; @@ -189,17 +186,17 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end);
- if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) + if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) continue;
- if (ioremap_pud_range(p4d, addr, next, phys_addr, prot)) + if (vmap_pud_range(p4d, addr, next, phys_addr, prot)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; }
-int ioremap_page_range(unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) +static int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { pgd_t *pgd; unsigned long start; @@ -213,7 +210,7 @@ int ioremap_page_range(unsigned long addr, pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); @@ -222,3 +219,9 @@ int ioremap_page_range(unsigned long addr,
return err; } + +int ioremap_page_range(unsigned long addr, + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) +{ + return vmap_range(addr, end, phys_addr, prot); +}
From: Nicholas Piggin npiggin@gmail.com
ascend inclusion category: feature bugzilla: NA CVE: NA
https://lwn.net/ml/linux-kernel/20200825145753.529284-6-npiggin@gmail.com/
--------------
This changes the awkward approach where architectures provide init functions to determine which levels they can provide large mappings for, to one where the arch is queried for each call.
This removes code and indirection, and allows constant-folding of dead code for unsupported levels.
This also adds a prot argument to the arch query. This is unused currently but could help with some architectures (e.g., some powerpc processors can't map uncacheable memory with large pages).
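With the init-time capability flags gone, every huge-mapping attempt performs its checks inline; a sketch of the resulting guard order at the PMD level, condensed from the hunks below:

	static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
	{
		if (max_page_shift < PMD_SHIFT)		/* caller-imposed limit */
			return 0;
		if (!arch_vmap_pmd_supported(prot))	/* per-call arch query */
			return 0;
		if ((end - addr) != PMD_SIZE)
			return 0;
		if (!IS_ALIGNED(phys_addr, PMD_SIZE))
			return 0;
		if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
			return 0;
		return pmd_set_huge(pmd, phys_addr, prot);
	}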
Cc: linuxppc-dev@lists.ozlabs.org Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Ingo Molnar mingo@redhat.com Cc: Borislav Petkov bp@alien8.de Cc: x86@kernel.org Cc: "H. Peter Anvin" hpa@zytor.com Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/include/asm/vmalloc.h | 8 +++ arch/arm64/mm/mmu.c | 10 ++-- arch/powerpc/include/asm/vmalloc.h | 8 +++ arch/x86/include/asm/vmalloc.h | 8 +++ arch/x86/mm/ioremap.c | 10 ++-- include/linux/io.h | 9 --- include/linux/vmalloc.h | 6 ++ init/main.c | 1 - mm/ioremap.c | 91 ++++++++++++------------------ 9 files changed, 77 insertions(+), 74 deletions(-)
diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 2ca708ab9b20..597b40405319 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -1,4 +1,12 @@ #ifndef _ASM_ARM64_VMALLOC_H #define _ASM_ARM64_VMALLOC_H
+#include <asm/page.h> + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +bool arch_vmap_p4d_supported(pgprot_t prot); +bool arch_vmap_pud_supported(pgprot_t prot); +bool arch_vmap_pmd_supported(pgprot_t prot); +#endif + #endif /* _ASM_ARM64_VMALLOC_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 437ee4ab41a1..cc44645a997b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -921,12 +921,12 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return dt_virt; }
-int __init arch_ioremap_p4d_supported(void) +bool arch_vmap_p4d_supported(pgprot_t prot) { - return 0; + return false; }
-int __init arch_ioremap_pud_supported(void) +bool arch_vmap_pud_supported(pgprot_t prot) { /* * Only 4k granule supports level 1 block mappings. @@ -936,9 +936,9 @@ int __init arch_ioremap_pud_supported(void) !IS_ENABLED(CONFIG_ARM64_PTDUMP_DEBUGFS); }
-int __init arch_ioremap_pmd_supported(void) +bool arch_vmap_pmd_supported(pgprot_t prot) { - /* See arch_ioremap_pud_supported() */ + /* See arch_vmap_pud_supported() */ return !IS_ENABLED(CONFIG_ARM64_PTDUMP_DEBUGFS); }
diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h index b992dfaaa161..105abb73f075 100644 --- a/arch/powerpc/include/asm/vmalloc.h +++ b/arch/powerpc/include/asm/vmalloc.h @@ -1,4 +1,12 @@ #ifndef _ASM_POWERPC_VMALLOC_H #define _ASM_POWERPC_VMALLOC_H
+#include <asm/page.h> + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +bool arch_vmap_p4d_supported(pgprot_t prot); +bool arch_vmap_pud_supported(pgprot_t prot); +bool arch_vmap_pmd_supported(pgprot_t prot); +#endif + #endif /* _ASM_POWERPC_VMALLOC_H */ diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h index ba6295fa5876..17864fb62e4b 100644 --- a/arch/x86/include/asm/vmalloc.h +++ b/arch/x86/include/asm/vmalloc.h @@ -1,4 +1,12 @@ #ifndef _ASM_X86_VMALLOC_H #define _ASM_X86_VMALLOC_H
+#include <asm/page.h> + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +bool arch_vmap_p4d_supported(pgprot_t prot); +bool arch_vmap_pud_supported(pgprot_t prot); +bool arch_vmap_pmd_supported(pgprot_t prot); +#endif + #endif /* _ASM_X86_VMALLOC_H */ diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6ce8a2b593e0..944de9aaa0cd 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -431,21 +431,21 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap);
-int __init arch_ioremap_p4d_supported(void) +bool arch_vmap_p4d_supported(pgprot_t prot) { - return 0; + return false; }
-int __init arch_ioremap_pud_supported(void) +bool arch_vmap_pud_supported(pgprot_t prot) { #ifdef CONFIG_X86_64 return boot_cpu_has(X86_FEATURE_GBPAGES); #else - return 0; + return false; #endif }
-int __init arch_ioremap_pmd_supported(void) +bool arch_vmap_pmd_supported(pgprot_t prot) { return boot_cpu_has(X86_FEATURE_PSE); } diff --git a/include/linux/io.h b/include/linux/io.h index 6cbd11b951ec..dc2b8faca00a 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -43,15 +43,6 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end, } #endif
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -void __init ioremap_huge_init(void); -int arch_ioremap_p4d_supported(void); -int arch_ioremap_pud_supported(void); -int arch_ioremap_pmd_supported(void); -#else -static inline void ioremap_huge_init(void) { } -#endif - /* * Managed iomap interface */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 086013413a1c..6cbbebc97ea8 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -59,6 +59,12 @@ struct vmap_area { struct vm_struct *vm; };
+#ifndef CONFIG_HAVE_ARCH_HUGE_VMAP +static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; } +static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; } +static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; } +#endif + /* * Highlevel APIs for driver use */ diff --git a/init/main.c b/init/main.c index e37a6f93f5d1..da2bf11f1f40 100644 --- a/init/main.c +++ b/init/main.c @@ -521,7 +521,6 @@ static void __init mm_init(void) kmem_cache_init(); pgtable_init(); vmalloc_init(); - ioremap_huge_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); /* Should be run after espfix64 is set up. */ diff --git a/mm/ioremap.c b/mm/ioremap.c index 0297d2e84032..e4c5044b1ab2 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -15,49 +15,16 @@ #include <asm/pgtable.h>
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static int __read_mostly ioremap_p4d_capable; -static int __read_mostly ioremap_pud_capable; -static int __read_mostly ioremap_pmd_capable; -static int __read_mostly ioremap_huge_disabled; +static bool __ro_after_init iomap_max_page_shift = PAGE_SHIFT;
static int __init set_nohugeiomap(char *str) { - ioremap_huge_disabled = 1; + iomap_max_page_shift = P4D_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); - -void __init ioremap_huge_init(void) -{ - if (!ioremap_huge_disabled) { - if (arch_ioremap_p4d_supported()) - ioremap_p4d_capable = 1; - if (arch_ioremap_pud_supported()) - ioremap_pud_capable = 1; - if (arch_ioremap_pmd_supported()) - ioremap_pmd_capable = 1; - } -} - -static inline int ioremap_p4d_enabled(void) -{ - return ioremap_p4d_capable; -} - -static inline int ioremap_pud_enabled(void) -{ - return ioremap_pud_capable; -} - -static inline int ioremap_pmd_enabled(void) -{ - return ioremap_pmd_capable; -} - -#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ -static inline int ioremap_p4d_enabled(void) { return 0; } -static inline int ioremap_pud_enabled(void) { return 0; } -static inline int ioremap_pmd_enabled(void) { return 0; } +#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +static const bool iomap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, @@ -79,9 +46,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, }
static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { - if (!ioremap_pmd_enabled()) + if (max_page_shift < PMD_SHIFT) + return 0; + + if (!arch_vmap_pmd_supported(prot)) return 0;
if ((end - addr) != PMD_SIZE) @@ -97,7 +68,8 @@ static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, }
static int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { pmd_t *pmd; unsigned long next; @@ -108,7 +80,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end);
- if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) continue;
if (vmap_pte_range(pmd, addr, next, phys_addr, prot)) @@ -118,9 +90,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, }
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { - if (!ioremap_pud_enabled()) + if (max_page_shift < PUD_SHIFT) + return 0; + + if (!arch_vmap_pud_supported(prot)) return 0;
if ((end - addr) != PUD_SIZE) @@ -136,7 +112,8 @@ static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, }
static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { pud_t *pud; unsigned long next; @@ -147,19 +124,23 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, do { next = pud_addr_end(addr, end);
- if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot)) + if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) continue;
- if (vmap_pmd_range(pud, addr, next, phys_addr, prot)) + if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; }
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { - if (!ioremap_p4d_enabled()) + if (max_page_shift < P4D_SHIFT) + return 0; + + if (!arch_vmap_p4d_supported(prot)) return 0;
if ((end - addr) != P4D_SIZE) @@ -175,7 +156,8 @@ static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, }
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { p4d_t *p4d; unsigned long next; @@ -186,17 +168,18 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, do { next = p4d_addr_end(addr, end);
- if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) + if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) continue;
- if (vmap_pud_range(p4d, addr, next, phys_addr, prot)) + if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; }
static int vmap_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) { pgd_t *pgd; unsigned long start; @@ -210,7 +193,7 @@ static int vmap_range(unsigned long addr, unsigned long end, pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, phys_addr, prot); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); @@ -223,5 +206,5 @@ static int vmap_range(unsigned long addr, unsigned long end, int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { - return vmap_range(addr, end, phys_addr, prot); + return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift); }
From: Nicholas Piggin npiggin@gmail.com
ascend inclusion category: feature bugzilla: NA CVE: NA
https://lwn.net/ml/linux-kernel/20200825145753.529284-8-npiggin@gmail.com/
--------------
This allows unsupported levels to be constant-folded away, and p4d_free_pud_page can then be removed because nothing references it any more.
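Because the helpers become static inlines returning constant expressions, the compiler can fold a call like arch_vmap_p4d_supported(prot) to false and discard the dependent huge-mapping path entirely; a sketch of the header form from the hunk below:

	static inline bool arch_vmap_p4d_supported(pgprot_t prot)
	{
		return false;	/* arm64 has no p4d block mappings */
	}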
Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/include/asm/vmalloc.h | 23 ++++++++++++++++++++--- arch/arm64/mm/mmu.c | 26 -------------------------- 2 files changed, 20 insertions(+), 29 deletions(-)
diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 597b40405319..fc9a12d6cc1a 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -4,9 +4,26 @@ #include <asm/page.h>
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -bool arch_vmap_p4d_supported(pgprot_t prot); -bool arch_vmap_pud_supported(pgprot_t prot); -bool arch_vmap_pmd_supported(pgprot_t prot); +static inline bool arch_vmap_p4d_supported(pgprot_t prot) +{ + return false; +} + +static inline bool arch_vmap_pud_supported(pgprot_t prot) +{ + /* + * Only 4k granule supports level 1 block mappings. + * SW table walks can't handle removal of intermediate entries. + */ + return IS_ENABLED(CONFIG_ARM64_4K_PAGES) && + !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); +} + +static inline bool arch_vmap_pmd_supported(pgprot_t prot) +{ + /* See arch_vmap_pud_supported() */ + return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); +} #endif
#endif /* _ASM_ARM64_VMALLOC_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index cc44645a997b..300972b01939 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -921,27 +921,6 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return dt_virt; }
-bool arch_vmap_p4d_supported(pgprot_t prot) -{ - return false; -} - -bool arch_vmap_pud_supported(pgprot_t prot) -{ - /* - * Only 4k granule supports level 1 block mappings. - * SW table walks can't handle removal of intermediate entries. - */ - return IS_ENABLED(CONFIG_ARM64_4K_PAGES) && - !IS_ENABLED(CONFIG_ARM64_PTDUMP_DEBUGFS); -} - -bool arch_vmap_pmd_supported(pgprot_t prot) -{ - /* See arch_vmap_pud_supported() */ - return !IS_ENABLED(CONFIG_ARM64_PTDUMP_DEBUGFS); -} - int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) { pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT | @@ -1041,11 +1020,6 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr) return 1; }
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) -{ - return 0; /* Don't attempt a block mapping */ -} - #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock)
From: Nicholas Piggin npiggin@gmail.com
ascend inclusion category: feature bugzilla: NA CVE: NA
https://lwn.net/ml/linux-kernel/20200825145753.529284-10-npiggin@gmail.com/
--------------
This is a generic kernel virtual memory mapper, not specific to ioremap.
Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/vmalloc.h | 3 + mm/ioremap.c | 176 ---------------------------------------- mm/vmalloc.c | 175 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 178 insertions(+), 176 deletions(-)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6cbbebc97ea8..496ac80046c0 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -147,6 +147,9 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr);
#ifdef CONFIG_MMU +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift); extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, diff --git a/mm/ioremap.c b/mm/ioremap.c index e4c5044b1ab2..8b1905bf03d2 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -27,182 +27,6 @@ early_param("nohugeiomap", set_nohugeiomap); static const bool iomap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) -{ - pte_t *pte; - u64 pfn; - - pfn = phys_addr >> PAGE_SHIFT; - pte = pte_alloc_kernel(pmd, addr); - if (!pte) - return -ENOMEM; - do { - BUG_ON(!pte_none(*pte)); - set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); - pfn++; - } while (pte++, addr += PAGE_SIZE, addr != end); - return 0; -} - -static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - if (max_page_shift < PMD_SHIFT) - return 0; - - if (!arch_vmap_pmd_supported(prot)) - return 0; - - if ((end - addr) != PMD_SIZE) - return 0; - - if (!IS_ALIGNED(phys_addr, PMD_SIZE)) - return 0; - - if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) - return 0; - - return pmd_set_huge(pmd, phys_addr, prot); -} - -static int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_alloc(&init_mm, pud, addr); - if (!pmd) - return -ENOMEM; - do { - next = pmd_addr_end(addr, end); - - if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) - continue; - - if (vmap_pte_range(pmd, addr, next, phys_addr, prot)) - return -ENOMEM; - } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - -static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - if (max_page_shift < PUD_SHIFT) - return 0; - - if (!arch_vmap_pud_supported(prot)) - return 0; - - if ((end - addr) != PUD_SIZE) - return 0; - - if (!IS_ALIGNED(phys_addr, PUD_SIZE)) - return 0; - - if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) - return 0; - - return pud_set_huge(pud, phys_addr, prot); -} - -static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - pud_t *pud; - unsigned long next; - - pud = pud_alloc(&init_mm, p4d, addr); - if (!pud) - return -ENOMEM; - do { - next = pud_addr_end(addr, end); - - if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) - continue; - - if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift)) - return -ENOMEM; - } while (pud++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - -static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - if (max_page_shift < P4D_SHIFT) - return 0; - - if (!arch_vmap_p4d_supported(prot)) - return 0; - - if ((end - addr) != P4D_SIZE) - return 0; - - if (!IS_ALIGNED(phys_addr, P4D_SIZE)) - return 0; - - if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) - return 0; - - return p4d_set_huge(p4d, phys_addr, prot); -} - -static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - p4d_t *p4d; - unsigned long next; - - p4d = p4d_alloc(&init_mm, pgd, addr); - if (!p4d) - return -ENOMEM; - do { - next = p4d_addr_end(addr, end); - - if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) - continue; - - if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift)) - return -ENOMEM; - } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - -static 
int vmap_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) -{ - pgd_t *pgd; - unsigned long start; - unsigned long next; - int err; - - might_sleep(); - BUG_ON(addr >= end); - - start = addr; - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift); - if (err) - break; - } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); - - flush_cache_vmap(start, end); - - return err; -} - int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c0dd884d65cd..b9bc49565252 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -59,6 +59,181 @@ static void free_work(struct work_struct *w) }
/*** Page table manipulation functions ***/ +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) +{ + pte_t *pte; + u64 pfn; + + pfn = phys_addr >> PAGE_SHIFT; + pte = pte_alloc_kernel(pmd, addr); + if (!pte) + return -ENOMEM; + do { + BUG_ON(!pte_none(*pte)); + set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + return 0; +} + +static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < PMD_SHIFT) + return 0; + + if (!arch_vmap_pmd_supported(prot)) + return 0; + + if ((end - addr) != PMD_SIZE) + return 0; + + if (!IS_ALIGNED(phys_addr, PMD_SIZE)) + return 0; + + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) + return 0; + + return pmd_set_huge(pmd, phys_addr, prot); +} + +static int vmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) + continue; + + if (vmap_pte_range(pmd, addr, next, phys_addr, prot)) + return -ENOMEM; + } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < PUD_SHIFT) + return 0; + + if (!arch_vmap_pud_supported(prot)) + return 0; + + if ((end - addr) != PUD_SIZE) + return 0; + + if (!IS_ALIGNED(phys_addr, PUD_SIZE)) + return 0; + + if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) + return 0; + + return pud_set_huge(pud, phys_addr, prot); +} + +static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc(&init_mm, p4d, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + + if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) + continue; + + if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift)) + return -ENOMEM; + } while (pud++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < P4D_SHIFT) + return 0; + + if (!arch_vmap_p4d_supported(prot)) + return 0; + + if ((end - addr) != P4D_SIZE) + return 0; + + if (!IS_ALIGNED(phys_addr, P4D_SIZE)) + return 0; + + if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) + return 0; + + return p4d_set_huge(p4d, phys_addr, prot); +} + +static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_alloc(&init_mm, pgd, addr); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + + if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) + continue; + + if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift)) + return -ENOMEM; + } while (p4d++, phys_addr += (next - addr), addr = 
next, addr != end); + return 0; +} + +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + pgd_t *pgd; + unsigned long start; + unsigned long next; + int err; + + might_sleep(); + BUG_ON(addr >= end); + + start = addr; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift); + if (err) + break; + } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); + + flush_cache_vmap(start, end); + + return err; +}
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) {
From: Nicholas Piggin <npiggin@gmail.com>
ascend inclusion
category: feature
bugzilla: NA
CVE: NA
https://lwn.net/ml/linux-kernel/20200825145753.529284-11-npiggin@gmail.com/
--------------
As a side-effect, the order of the flush_cache_vmap() and arch_sync_kernel_mappings() calls is switched, but that now matches the other callers in this file.
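The point of the split becomes visible in the hugepage vmalloc patch later in this series: a caller can issue several _noflush mappings and flush the cache once for the whole range. A condensed sketch of that pattern (the function name here is illustrative, not the patch's):

/* Map one huge page at a time without flushing, then flush once. */
static int map_huge_pages(unsigned long addr, unsigned long end,
			  pgprot_t prot, struct page **pages,
			  unsigned int page_shift)
{
	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
	unsigned long start = addr;
	int err;

	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
					 __pa(page_address(pages[i])),
					 prot, page_shift);
		if (err)
			return err;
		addr += 1UL << page_shift;
	}
	flush_cache_vmap(start, end);
	return 0;
}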
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/vmalloc.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b9bc49565252..dd39b64deeef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -209,7 +209,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, return 0; }
-int vmap_range(unsigned long addr, unsigned long end, +static int vmap_range_noflush(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { @@ -230,7 +230,17 @@ int vmap_range(unsigned long addr, unsigned long end, break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
- flush_cache_vmap(start, end); + return err; +} + +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + int err; + + err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift); + flush_cache_vmap(addr, end);
return err; }
From: Nicholas Piggin <npiggin@gmail.com>
ascend inclusion
category: feature
bugzilla: NA
CVE: NA
https://lwn.net/ml/linux-kernel/20200825145753.529284-12-npiggin@gmail.com/
Don't distinguish between vmalloc and hugepage vmalloc, because there is no size print in alloc_large_system_hash() in v4.19.
This patch also adds page_order to vm_struct, which breaks kABI.
--------------
Support huge page vmalloc mappings. The config option HAVE_ARCH_HUGE_VMALLOC enables support on architectures that define HAVE_ARCH_HUGE_VMAP and support PMD-sized vmap mappings.
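No architecture is wired up in this backport; an arch that already implements HAVE_ARCH_HUGE_VMAP would opt in with a one-line select in its Kconfig entry (hypothetical sketch, not part of this series):

config PPC
	select HAVE_ARCH_HUGE_VMALLOC
	# (rest of the existing entry unchanged)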
vmalloc will attempt to allocate PMD-sized pages when the requested size is PMD-sized or larger, and will fall back to small pages if that allocation is unsuccessful.
Allocations that do not use PAGE_KERNEL prot are not permitted to use huge pages, because not all callers expect this (e.g., module allocations vs strict module rwx).
This reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.
This can result in more internal fragmentation and memory overhead for a given allocation, so a nohugevmalloc option is added to disable huge mappings at boot (on the kernel command line).
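Condensed from the diff below, the sizing policy in __vmalloc_node_range() ties these pieces together: huge mappings are tried only when enabled (vmap_allow_huge, cleared by nohugevmalloc), only for PAGE_KERNEL, and only when each node's share of the allocation is at least PMD_SIZE; on failure, the fail: path retries with small pages:

unsigned int shift = PAGE_SHIFT;

if (vmap_allow_huge && pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
	unsigned long size_per_node = size;

	/* spread across nodes before judging if a huge mapping pays off */
	if (node == NUMA_NO_NODE)
		size_per_node /= num_online_nodes();
	if (size_per_node >= PMD_SIZE) {
		shift = PMD_SHIFT;
		align = max(real_align, 1UL << shift);
		size = ALIGN(real_size, 1UL << shift);
	}
}

/* On failure while shift > PAGE_SHIFT, the fail: path resets shift,
 * align and size to their small-page values and jumps back to retry. */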
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/Kconfig | 4 +
 include/linux/vmalloc.h | 1 +
 mm/vmalloc.c | 160 +++++++++++++++++++++++++++++++---------
 3 files changed, 130 insertions(+), 35 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig index d3d70369bf9c..deb8c405b4ae 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -528,6 +528,10 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD config HAVE_ARCH_HUGE_VMAP bool
+config HAVE_ARCH_HUGE_VMALLOC + depends on HAVE_ARCH_HUGE_VMAP + bool + config HAVE_ARCH_SOFT_DIRTY bool
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 496ac80046c0..07b4b1141ed8 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -39,6 +39,7 @@ struct vm_struct { unsigned long size; unsigned long flags; struct page **pages; + unsigned int page_order; unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dd39b64deeef..2fec803edc90 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -41,6 +41,19 @@
#include "internal.h"
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +static bool __ro_after_init vmap_allow_huge = true; + +static int __init set_nohugevmalloc(char *str) +{ + vmap_allow_huge = false; + return 0; +} +early_param("nohugevmalloc", set_nohugevmalloc); +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +static const bool vmap_allow_huge = false; +#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ + struct vfree_deferred { struct llist_head list; struct work_struct wq; @@ -410,6 +423,61 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, return 0; }
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + int nr = 0; + + BUG_ON(addr >= end); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr); + if (err) + return err; + } while (pgd++, addr = next, addr != end); + + return 0; +} + +static int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + unsigned int i, nr = (end - addr) >> PAGE_SHIFT; + + WARN_ON(page_shift < PAGE_SHIFT); + + if (page_shift == PAGE_SHIFT) + return vmap_small_pages_range_noflush(addr, end, prot, pages); + + for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +static int vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; +} + /** * map_kernel_range_noflush - map kernel VM area with the specified pages * @addr: start of the VM area to map @@ -431,22 +499,7 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, int map_kernel_range_noflush(unsigned long addr, unsigned long size, pgprot_t prot, struct page **pages) { - unsigned long end = addr + size; - unsigned long next; - pgd_t *pgd; - int err = 0; - int nr = 0; - - BUG_ON(addr >= end); - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, end); - err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr); - if (err) - return err; - } while (pgd++, addr = next, addr != end); - - return 0; + return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT); }
int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, @@ -2270,11 +2323,11 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { int i;
- for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << area->page_order) { struct page *page = area->pages[i];
BUG_ON(!page); - __free_pages(page, 0); + __free_pages(page, area->page_order); }
kvfree(area->pages); @@ -2403,9 +2456,12 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, unsigned int page_shift, int node) { struct page **pages; + unsigned long addr = (unsigned long)area->addr; + unsigned long size = get_vm_area_size(area); + unsigned int page_order = page_shift - PAGE_SHIFT; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; @@ -2413,7 +2469,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 0 : __GFP_HIGHMEM;
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *));
/* Please note that the recursion is strictly bounded. */ @@ -2432,27 +2488,27 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->pages = pages; area->nr_pages = nr_pages; + area->page_order = page_order;
- for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page; + int p;
- if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask|highmem_mask); - else - page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); - + page = alloc_pages_node(node, alloc_mask|highmem_mask, page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; goto fail; } - area->pages[i] = page; + + for (p = 0; p < (1U << page_order); p++) + area->pages[i + p] = page + p; + if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); }
- if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), - prot, pages) < 0) + if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) goto fail;
return area->addr; @@ -2460,7 +2516,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, fail: warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", - (area->nr_pages*PAGE_SIZE), area->size); + (area->nr_pages*PAGE_SIZE), size); vfree(area->addr); return NULL; } @@ -2489,19 +2545,42 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, struct vm_struct *area; void *addr; unsigned long real_size = size; + unsigned long real_align = align; + unsigned int shift = PAGE_SHIFT;
- size = PAGE_ALIGN(size); if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail;
+ if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))) { + unsigned long size_per_node; + + /* + * Try huge pages. Only try for PAGE_KERNEL allocations, + * others like modules don't yet expect huge pages in + * their allocations due to apply_to_page_range not + * supporting them. + */ + + size_per_node = size; + if (node == NUMA_NO_NODE) + size_per_node /= num_online_nodes(); + if (size_per_node >= PMD_SIZE) { + shift = PMD_SHIFT; + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); + } + } + +again: + size = PAGE_ALIGN(size); area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) goto fail;
- addr = __vmalloc_area_node(area, gfp_mask, prot, node); + addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) - return NULL; + goto fail;
/* * First make sure the mappings are removed from all page-tables @@ -2521,8 +2600,19 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr;
fail: - warn_alloc(gfp_mask, NULL, + if (shift > PAGE_SHIFT) { + free_vm_area(area); + shift = PAGE_SHIFT; + align = real_align; + size = real_size; + goto again; + } + + if (!area) { + /* Warn for area allocation, page allocations already warn */ + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure: %lu bytes", real_size); + } return NULL; }
@@ -3501,7 +3591,7 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " %pS", v->caller);
if (v->nr_pages) - seq_printf(m, " pages=%d", v->nr_pages); + seq_printf(m, " pages=%d order=%d", v->nr_pages, v->page_order);
if (v->phys_addr) seq_printf(m, " phys=%pa", &v->phys_addr);