From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01
CVE: NA
--------------------------------
Make efi_print_memmap() public in preparation for adding fake memory support for architectures with EFI support, e.g. arm64.
Co-developed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/include/asm/efi.h | 1 - arch/x86/platform/efi/efi.c | 16 ---------------- drivers/firmware/efi/memmap.c | 16 ++++++++++++++++ include/linux/efi.h | 1 + 4 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index bc9758ef292e..3be8754408d5 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -138,7 +138,6 @@ struct efi_scratch {
extern struct efi_scratch efi_scratch; extern int __init efi_memblock_x86_reserve_range(void); -extern void __init efi_print_memmap(void); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 8a26e705cb06..ef6f4cbffe28 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -323,22 +323,6 @@ static void __init efi_clean_memmap(void) } }
-void __init efi_print_memmap(void) -{ - efi_memory_desc_t *md; - int i = 0; - - for_each_efi_memory_desc(md) { - char buf[64]; - - pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", - i++, efi_md_typeattr_format(buf, sizeof(buf), md), - md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, - (md->num_pages >> (20 - EFI_PAGE_SHIFT))); - } -} - static int __init efi_systab_init(unsigned long phys) { int size = efi_enabled(EFI_64BIT) ? sizeof(efi_system_table_64_t) diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c index 2ff1883dc788..0155bf066ba5 100644 --- a/drivers/firmware/efi/memmap.c +++ b/drivers/firmware/efi/memmap.c @@ -376,3 +376,19 @@ void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, } } } + +void __init efi_print_memmap(void) +{ + efi_memory_desc_t *md; + int i = 0; + + for_each_efi_memory_desc(md) { + char buf[64]; + + pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", + i++, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, + (md->num_pages >> (20 - EFI_PAGE_SHIFT))); + } +} diff --git a/include/linux/efi.h b/include/linux/efi.h index e17cd4c44f93..280f36cb7c14 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -643,6 +643,7 @@ extern int __init efi_memmap_split_count(efi_memory_desc_t *md, struct range *range); extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, struct efi_mem_range *mem); +extern void __init efi_print_memmap(void);
#ifdef CONFIG_EFI_ESRT extern void __init efi_esrt_init(void);
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01
CVE: NA
--------------------------------
The fake memory map is used to fake memory attribute values. Commit 0f96a99dab36 ("efi: Add "efi_fake_mem" boot option") introduced the efi_fake_mem function. With this patch it is supported on arm64 as well. For example, you can mark 0-6G memory as EFI_MEMORY_MORE_RELIABLE by adding efi_fake_mem=6G@0:0x10000 to the boot arguments. More information about the fake memmap can be found in kernel-parameters.txt.
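For reference, the attribute value is the UEFI memory attribute bitmask, and 0x10000 is the EFI_MEMORY_MORE_RELIABLE bit. Several ranges can be faked at once, following the nn[KMG]@ss[KMG]:aa[,...] format documented for the option, for instance:

    efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000

which marks two 2G regions as more reliable.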
The variable memstart_addr is only finalized after arm64_memblock_init(), so the efi_fake_memmap() call needs to be added after arm64_memblock_init().
Otherwise:
efi_memmap_alloc
  memblock_phys_alloc
    kmemleak_alloc_phys
      kmemleak_alloc(__va(phys), size, min_count, gfp);
this __va() converts phys to an invalid virtual address and leads to a kmemleak error.
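To illustrate (simplified from the arm64 headers, not the literal source):

    /* PHYS_OFFSET is memstart_addr, set up by arm64_memblock_init() */
    #define __phys_to_virt(x)  ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET)
    #define __va(x)            ((void *)__phys_to_virt((phys_addr_t)(x)))

Before arm64_memblock_init() runs, memstart_addr still holds its initial placeholder value, so __va() yields an address outside the linear map and kmemleak registers a bogus object.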
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/arm64/kernel/setup.c | 2 ++ drivers/firmware/efi/Kconfig | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a4e5614bee12..a0f400650030 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1290,7 +1290,7 @@ you are really sure that your UEFI does sane gc and fulfills the spec otherwise your board may brick.
- efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86] + efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86; ARM64] Add arbitrary attribute to specific memory range by updating original EFI memory map. Region of memory which aa attribute is added to is diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 5e0713f5120e..517b230445be 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -387,6 +387,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
arm64_memblock_init();
+ efi_fake_memmap(); + paging_init();
acpi_table_upgrade(); diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index c196b7ef6a2a..825d6619ded6 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -50,7 +50,7 @@ config EFI_RUNTIME_MAP
config EFI_FAKE_MEMMAP bool "Enable EFI fake memory map" - depends on EFI && X86 + depends on EFI && (X86 || ARM64) default n help Saying Y here will enable "efi_fake_mem" boot option.
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01
CVE: NA
--------------------------------
Commit b05b9f5f9dcf ("x86, mirror: x86 enabling - find mirrored memory ranges") introduced the efi_find_mirror() function on x86. In order to reuse the API, make it public in preparation for mirrored memory support on arm64.
Co-developed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/platform/efi/efi.c | 23 ----------------------- drivers/firmware/efi/efi.c | 23 +++++++++++++++++++++++ include/linux/efi.h | 3 +++ 3 files changed, 26 insertions(+), 23 deletions(-)
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ef6f4cbffe28..b7cf79b818d0 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -105,29 +105,6 @@ static int __init setup_add_efi_memmap(char *arg) } early_param("add_efi_memmap", setup_add_efi_memmap);
-void __init efi_find_mirror(void) -{ - efi_memory_desc_t *md; - u64 mirror_size = 0, total_size = 0; - - if (!efi_enabled(EFI_MEMMAP)) - return; - - for_each_efi_memory_desc(md) { - unsigned long long start = md->phys_addr; - unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; - - total_size += size; - if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { - memblock_mark_mirror(start, size); - mirror_size += size; - } - } - if (mirror_size) - pr_info("Memory: %lldM/%lldM mirrored memory\n", - mirror_size>>20, total_size>>20); -} - /* * Tell the kernel about the EFI memory map. This might include * more than the max 128 entries that can fit in the passed in e820 diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 4cd03ab9a5a6..555a52ead131 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -435,6 +435,29 @@ static int __init efisubsys_init(void)
subsys_initcall(efisubsys_init);
+void __init efi_find_mirror(void) +{ + efi_memory_desc_t *md; + u64 mirror_size = 0, total_size = 0; + + if (!efi_enabled(EFI_MEMMAP)) + return; + + for_each_efi_memory_desc(md) { + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + + total_size += size; + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { + memblock_mark_mirror(start, size); + mirror_size += size; + } + } + if (mirror_size) + pr_info("Memory: %lldM/%lldM mirrored memory\n", + mirror_size>>20, total_size>>20); +} + /* * Find the efi memory descriptor for a given physical address. Given a * physical address, determine if it exists within an EFI Memory Map entry, diff --git a/include/linux/efi.h b/include/linux/efi.h index 280f36cb7c14..395a0e344ffd 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -812,6 +812,7 @@ static inline bool efi_rt_services_supported(unsigned int mask) { return (efi.runtime_supported_mask & mask) == mask; } +extern void efi_find_mirror(void); #else static inline bool efi_enabled(int feature) { @@ -835,6 +836,8 @@ static inline bool efi_rt_services_supported(unsigned int mask) { return false; } + +static inline void efi_find_mirror(void) {} #endif
extern int efi_status_to_err(efi_status_t status);
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01
CVE: NA
--------------------------------
Mirrored memory can be used on HiSilicon's arm64 SoCs, so call efi_find_mirror() in setup_arch() so that the kernel can mark any mirrored memory ranges in memblock.
Co-developed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/arm64/kernel/setup.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a0f400650030..50abc29d8366 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2226,7 +2226,7 @@
keepinitrd [HW,ARM]
- kernelcore= [KNL,X86,IA-64,PPC] + kernelcore= [KNL,X86,IA-64,PPC,ARM64] Format: nn[KMGTPE] | nn% | "mirror" This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 517b230445be..3c834d7c299a 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -388,6 +388,7 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) arm64_memblock_init();
efi_fake_memmap(); + efi_find_mirror();
paging_init();
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01
CVE: NA
--------------------------------
If the system has mirrored memory, memblock tries to allocate mirrored memory first and falls back to non-mirrored memory when that fails. But with limited mirrored memory, or when some NUMA node has no mirrored memory at all, lots of warning messages about failed memblock allocations will be printed.
Ratelimit the warning message to avoid a very long print during bootup.
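For context, pr_warn_ratelimited() expands roughly as follows (see include/linux/printk.h; by default at most 10 messages per 5-second interval per call site):

    #define pr_warn_ratelimited(fmt, ...) \
            printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)

    /* printk_ratelimited() keeps one static state per call site: */
    static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, /* 5*HZ */
                                  DEFAULT_RATELIMIT_BURST);        /* 10   */
    if (__ratelimit(&_rs))
            printk(fmt, ...);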
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/memblock.c b/mm/memblock.c index 7b25c54ab5c8..873625fdc504 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -327,7 +327,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, NUMA_NO_NODE, flags);
if (!ret && (flags & MEMBLOCK_MIRROR)) { - pr_warn("Could not allocate %pap bytes of mirrored memory\n", + pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n", &size); flags &= ~MEMBLOCK_MIRROR; goto again; @@ -1359,7 +1359,7 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
if (flags & MEMBLOCK_MIRROR) { flags &= ~MEMBLOCK_MIRROR; - pr_warn("Could not allocate %pap bytes of mirrored memory\n", + pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n", &size); goto again; }
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01
CVE: NA
--------------------------------
On a system with only limited mirrored memory, or with some NUMA node lacking mirrored memory, the per-node vmemmap page structs prefer to be allocated from the mirrored region, which leads to vmemmap_verify() reporting lots of warning messages.
Demote the "potential offnode page_structs" warning messages to debug level to avoid a very long print during bootup.
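Note that pr_debug() output is compiled out unless DEBUG is defined or CONFIG_DYNAMIC_DEBUG is enabled; with dynamic debug the message can still be re-enabled at runtime when needed, e.g.:

    # echo 'file sparse-vmemmap.c +p' > /sys/kernel/debug/dynamic_debug/control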
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/sparse-vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bdce883f9286..396a49462894 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -490,7 +490,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node, int actual_node = early_pfn_to_nid(pfn);
if (node_distance(actual_node, node) > LOCAL_DISTANCE) - pr_warn("[%lx-%lx] potential offnode page_structs\n", + pr_debug("[%lx-%lx] potential offnode page_structs\n", start, end - 1); }
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01
CVE: NA
--------------------------------
With this patch, efi_find_mirror() checks mirrored_kernelcore before marking mirrored ranges, so the basic mirror feature is only enabled when it is actually requested.
Without this check, if the system has some mirrored memory but the mirror feature is not specified in the boot parameters, the basic mirror feature would still be enabled, which leads to the following situations:
- memblock memory allocation prefers the mirrored region. This may have some unexpected influence on NUMA affinity.
- Contiguous memory will be split into several parts if part of it is mirrored memory, via memblock_mark_mirror() (illustrated below).
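For example (a hypothetical layout, not taken from this patch), marking part of a region as mirror splits it:

    /* before: one region [0x80000000, 0x100000000), no flags      */
    memblock_mark_mirror(0x80000000ULL, SZ_1G);
    /* after: two regions:                                         */
    /*   [0x80000000, 0xc0000000)   MEMBLOCK_MIRROR                */
    /*   [0xc0000000, 0x100000000)  no flags                       */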
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/firmware/efi/efi.c | 3 +++ include/linux/mm.h | 2 ++ mm/page_alloc.c | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 555a52ead131..098d8796113a 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -443,6 +443,9 @@ void __init efi_find_mirror(void) if (!efi_enabled(EFI_MEMMAP)) return;
+ if (!mirrored_kernelcore) + return; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/include/linux/mm.h b/include/linux/mm.h index 2db253209f81..52c4ac088b88 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2473,6 +2473,8 @@ extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); extern unsigned long find_min_pfn_with_active_regions(void);
+extern bool mirrored_kernelcore; + #ifndef CONFIG_NEED_MULTIPLE_NODES static inline int early_pfn_to_nid(unsigned long pfn) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef761d5c7025..7ea4531cc557 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -376,7 +376,7 @@ static unsigned long required_kernelcore_percent __initdata; static unsigned long required_movablecore __initdata; static unsigned long required_movablecore_percent __initdata; static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; -static bool mirrored_kernelcore __meminitdata; +bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone;
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01
CVE: NA
--------------------------------
Introduction
============
The memory reliable feature is a memory tiering mechanism. It is based on the kernel mirror feature, which splits memory into two separate regions: the mirrored (reliable) region and the non-mirrored (non-reliable) region.
The kernel mirror feature by itself:

- allocates kernel memory from the mirrored region by default
- allocates user memory from the non-mirrored region by default

The non-mirrored region is arranged into ZONE_MOVABLE.
The memory reliable feature adds the following on top:

- normal user tasks never allocate memory from the mirrored region via the userspace APIs (malloc, mmap, etc.)
- special user tasks allocate memory from the mirrored region by default
- tmpfs/pagecache allocate memory from the mirrored region by default
- an upper limit on the mirrored region allocated for user tasks, tmpfs and pagecache
A reliable fallback mechanism is supported which allows special user tasks, tmpfs and pagecache to fall back to allocating from the non-mirrored region; this is the default setting.
In order to fulfil these goals:

- The ___GFP_RELIABLE flag is added to allocate memory from the mirrored region.

- The high_zoneidx for special user tasks/tmpfs/pagecache is set to ZONE_NORMAL.

- Normal user tasks can only allocate from ZONE_MOVABLE.
This patch is just the main framework; memory reliable support for special user tasks, pagecache and tmpfs comes in patches of its own.
To enable this function, mirrored (reliable) memory is needed and "kernelcore=reliable" must be added to the kernel parameters.
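As a minimal sketch of the effect (using only the flags introduced below), a movable allocation that carries GFP_RELIABLE is redirected from ZONE_MOVABLE to ZONE_NORMAL by the gfp_zone() hunk in this patch:

    struct page *page;

    /* gfp_zone() -> ZONE_MOVABLE: non-mirrored region */
    page = alloc_pages(GFP_HIGHUSER_MOVABLE, 0);

    /* gfp_zone() -> ZONE_NORMAL: mirrored (reliable) region */
    page = alloc_pages(GFP_HIGHUSER_MOVABLE | GFP_RELIABLE, 0);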
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../admin-guide/kernel-parameters.txt | 5 +- include/linux/gfp.h | 15 +++- include/linux/mem_reliable.h | 62 +++++++++++++++ include/linux/mm.h | 3 + include/trace/events/mmflags.h | 2 +- mm/Kconfig | 18 +++++ mm/Makefile | 1 + mm/mem_reliable.c | 79 +++++++++++++++++++ mm/page_alloc.c | 31 +++++++- tools/perf/builtin-kmem.c | 2 +- 10 files changed, 211 insertions(+), 7 deletions(-) create mode 100644 include/linux/mem_reliable.h create mode 100644 mm/mem_reliable.c
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 50abc29d8366..b5524464f1cb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2227,7 +2227,7 @@ keepinitrd [HW,ARM]
kernelcore= [KNL,X86,IA-64,PPC,ARM64] - Format: nn[KMGTPE] | nn% | "mirror" + Format: nn[KMGTPE] | nn% | "mirror" | "reliable" This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested amount is spread evenly throughout all nodes in the @@ -2251,6 +2251,9 @@ for Movable pages. "nn[KMGTPE]", "nn%", and "mirror" are exclusive, so you cannot specify multiple forms.
+ Option "reliable" is base on option "mirror", but make + some extension. These two features are alternatives. + kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. Format: <Controller#>[,poll interval] The controller # is the number of the ehci usb debug diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 80efbea0c9d7..5c3df92a4745 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,7 +39,7 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x100000u #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u -#define ___GFP_RESERVE_0 0x800000u +#define ___GFP_RELIABLE 0x800000u #define ___GFP_RESERVE_1 0x1000000u #ifdef CONFIG_LOCKDEP #define ___GFP_NOLOCKDEP 0x2000000u @@ -225,8 +225,10 @@ struct vm_area_struct; /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
-/* Reserve 2 flags for future usage */ -#define __GFP_RESERVE_0 ((__force gfp_t)___GFP_RESERVE_0) +/* Alloc memory from mirrored region */ +#define __GFP_RELIABLE ((__force gfp_t)___GFP_RELIABLE) + +/* Reserve 1 flags for future usage */ #define __GFP_RESERVE_1 ((__force gfp_t)___GFP_RESERVE_1)
/* Room for N __GFP_FOO bits */ @@ -315,6 +317,7 @@ struct vm_area_struct; #define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) +#define GFP_RELIABLE __GFP_RELIABLE
/* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) @@ -461,6 +464,12 @@ static inline enum zone_type gfp_zone(gfp_t flags) z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + +#ifdef CONFIG_MEMORY_RELIABLE + if (z == ZONE_MOVABLE && (flags & GFP_RELIABLE)) + return ZONE_NORMAL; +#endif + return z; }
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h new file mode 100644 index 000000000000..02f73a91058b --- /dev/null +++ b/include/linux/mem_reliable.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MM_MEM_RELIABLE__ +#define __MM_MEM_RELIABLE__ + +#include <linux/stddef.h> +#include <linux/gfp.h> +#include <linux/mmzone.h> +#include <linux/mm_types.h> +#include <linux/sched.h> + +#ifdef CONFIG_MEMORY_RELIABLE + +extern struct static_key_false mem_reliable; + +extern bool reliable_enabled; + +extern void add_reliable_mem_size(long sz); +extern void mem_reliable_init(bool has_unmirrored_mem, + unsigned long *zone_movable_pfn); + +static inline bool mem_reliable_is_enabled(void) +{ + return static_branch_likely(&mem_reliable); +} + +static inline bool zone_reliable(struct zone *zone) +{ + return mem_reliable_is_enabled() && zone_idx(zone) < ZONE_MOVABLE; +} + +static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) +{ + if (!mem_reliable_is_enabled()) + return false; + + if (!current->mm || (current->flags & PF_KTHREAD)) + return false; + + /* user tasks can only alloc memory from non-mirrored region */ + if (!(gfp & GFP_RELIABLE) && (gfp & __GFP_HIGHMEM) && + (gfp & __GFP_MOVABLE)) { + if (zonelist_zone_idx(z) < ZONE_MOVABLE) + return true; + } + + return false; +} +#else +#define reliable_enabled 0 + +static inline bool mem_reliable_is_enabled(void) { return false; } +static inline void add_reliable_mem_size(long sz) {} +static inline void mem_reliable_init(bool has_unmirrored_mem, + unsigned long *zone_movable_pfn) {} +static inline bool zone_reliable(struct zone *zone) { return false; } +static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) +{ + return false; +} +#endif + +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 52c4ac088b88..859d5200c57b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,6 +34,9 @@ #include <linux/pgtable.h> #include <linux/kabi.h>
+/* added to mm.h to avoid every caller adding new header file */ +#include <linux/mem_reliable.h> + struct mempolicy; struct anon_vma; struct anon_vma_chain; diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index fba14499b87e..dc1805fbf893 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -49,7 +49,7 @@ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_RESERVE_0, "__GFP_RESERVE_0"}, \ + {(unsigned long)__GFP_RELIABLE, "__GFP_RELIABLE"}, \ {(unsigned long)__GFP_RESERVE_1, "__GFP_RESERVE_1"} \
#define show_gfp_flags(flags) \ diff --git a/mm/Kconfig b/mm/Kconfig index 1ba477dee3ae..4475bd9f8762 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -951,6 +951,24 @@ config PID_RESERVE and reserve the pids before init task start. In restore process, free the reserved pids and realloc them for use.
+config MEMORY_RELIABLE + bool "Support for memory reliable" + depends on NEED_MULTIPLE_NODES + depends on ARM64 + default n + help + Memory reliable is based on mirror memory. It has the following + additional features: + a) normal user tasks never alloc memory from mirrored region; + b) special user tasks will allocate memory from mirrored region + by default; c) upper limit of mirrored region allcated for user + tasks, tmpfs and pagecache. + Special user tasks and tmpfs/pagecache can fallback to + non-mirrored region if you enable reliable fallback mechanism. + + To enable this function, mirrored memory is needed and + "kernelcore=reliable" need to be added in kernel parameters. + source "mm/damon/Kconfig"
endmenu diff --git a/mm/Makefile b/mm/Makefile index 4b0b5e7af40f..4b3a827429f3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -128,3 +128,4 @@ obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_PIN_MEMORY) += pin_mem.o obj-$(CONFIG_SHRINK_PAGECACHE) += page_cache_limit.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o +obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c new file mode 100644 index 000000000000..7914c76c1fcd --- /dev/null +++ b/mm/mem_reliable.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "mem reliable: " fmt + +#include <linux/mm.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> + +DEFINE_STATIC_KEY_FALSE(mem_reliable); + +bool reliable_enabled; + +static atomic_long_t total_reliable_mem; + +void add_reliable_mem_size(long sz) +{ + atomic_long_add(sz, &total_reliable_mem); +} + +static unsigned long total_reliable_mem_sz(void) +{ + return atomic_long_read(&total_reliable_mem); +} + +static int reliable_mem_notifier(struct notifier_block *nb, + unsigned long action, void *arg) +{ + struct memory_notify *m_arg = arg; + struct zone *zone; + + switch (action) { + case MEM_ONLINE: + zone = page_zone(pfn_to_page(m_arg->start_pfn)); + if (zone_reliable(zone)) + add_reliable_mem_size(m_arg->nr_pages * PAGE_SIZE); + break; + case MEM_OFFLINE: + zone = page_zone(pfn_to_page(m_arg->start_pfn)); + if (zone_reliable(zone)) + add_reliable_mem_size(-m_arg->nr_pages * PAGE_SIZE); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block reliable_notifier_block = { + .notifier_call = reliable_mem_notifier, +}; + +void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) +{ + if (!reliable_enabled) + return; + + if (atomic_long_read(&total_reliable_mem) == 0) { + memset(zone_movable_pfn, 0, + sizeof(unsigned long) * MAX_NUMNODES); + pr_err("init failed, mirrored memory size is zero.\n"); + return; + } + + if (!has_unmirrored_mem) { + pr_err("init failed, unmirrored memory size is zero.\n"); + return; + } + + if (register_hotmemory_notifier(&reliable_notifier_block)) { + pr_err("init failed, register memory notifier failed.\n"); + return; + } + + static_branch_enable(&mem_reliable); + + pr_info("init succeed, mirrored memory size(%lu)\n", + total_reliable_mem_sz()); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ea4531cc557..bfe215fc0da1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3840,6 +3840,10 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct page *page; unsigned long mark;
+ /* skip non-movable zone for normal user tasks */ + if (skip_none_movable_zone(gfp_mask, z)) + continue; + /* * CDM nodes get skipped if the requested gfp flag * does not have __GFP_THISNODE set or the nodemask @@ -7494,10 +7498,13 @@ static void __init find_zone_movable_pfns_for_nodes(void) */ if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; + bool has_unmirrored_mem = false;
for_each_mem_region(r) { - if (memblock_is_mirror(r)) + if (memblock_is_mirror(r)) { + add_reliable_mem_size(r->size); continue; + }
nid = memblock_get_region_node(r);
@@ -7508,6 +7515,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) continue; }
+ has_unmirrored_mem = true; zone_movable_pfn[nid] = zone_movable_pfn[nid] ? min(usable_startpfn, zone_movable_pfn[nid]) : usable_startpfn; @@ -7516,6 +7524,8 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mem_below_4gb_not_mirrored) pr_warn("This configuration results in unmirrored kernel memory.\n");
+ mem_reliable_init(has_unmirrored_mem, zone_movable_pfn); + goto out2; }
@@ -7827,10 +7837,29 @@ static int __init cmdline_parse_kernelcore(char *p) { /* parse kernelcore=mirror */ if (parse_option_str(p, "mirror")) { + if (reliable_enabled) { + pr_info("kernelcore=reliable and kernelcore=mirror are alternative.\n"); + return -EINVAL; + } + mirrored_kernelcore = true; return 0; }
+#ifdef CONFIG_MEMORY_RELIABLE + /* parse kernelcore=reliable */ + if (parse_option_str(p, "reliable")) { + if (!reliable_enabled && mirrored_kernelcore) { + pr_info("kernelcore=mirror and kernelcore=reliable are alternative.\n"); + return -EINVAL; + } + + reliable_enabled = true; + mirrored_kernelcore = true; + return 0; + } +#endif + return cmdline_parse_core(p, &required_kernelcore, &required_kernelcore_percent); } diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index ffe9cf4160cf..5f4cc7126688 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -660,7 +660,7 @@ static const struct { { "__GFP_RECLAIM", "R" }, { "__GFP_DIRECT_RECLAIM", "DR" }, { "__GFP_KSWAPD_RECLAIM", "KR" }, - { "__GFP_RESERVE_0", "RE0" }, + { "__GFP_RELIABLE", "REL" }, { "__GFP_RESERVE_1", "RE1" }, };
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01
CVE: NA
--------------------------------
Add ReliableTotal & ReliableUsed to /proc/meminfo to show information about reliable memory.
- ReliableTotal: total reliable RAM
- ReliableUsed: the used amount of reliable memory
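With the feature enabled, the new fields read like any other meminfo counter (values are the illustrative ones from the documentation hunk below):

    $ grep Reliable /proc/meminfo
    ReliableTotal:   7340032 kB
    ReliableUsed:     418824 kB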
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/filesystems/proc.rst | 6 ++++++ fs/proc/meminfo.c | 2 ++ include/linux/mem_reliable.h | 2 ++ mm/mem_reliable.c | 25 +++++++++++++++++++++++++ 4 files changed, 35 insertions(+)
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 533c79e8d2cd..e1562bef4887 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -963,6 +963,8 @@ varies by architecture and compile options. The following is from a AnonHugePages: 49152 kB ShmemHugePages: 0 kB ShmemPmdMapped: 0 kB + ReliableTotal: 7340032 kB + ReliableUsed: 418824 kB
MemTotal Total usable RAM (i.e. physical RAM minus a few reserved @@ -1092,6 +1094,10 @@ VmallocChunk Percpu Memory allocated to the percpu allocator used to back percpu allocations. This stat excludes the cost of metadata. +ReliableTotal + Total reliable memory size +ReliableUsed + The used amount of reliable memory
vmallocinfo ~~~~~~~~~~~ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 887a5532e449..e03212ddaddf 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -150,6 +150,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
arch_report_meminfo(m);
+ reliable_report_meminfo(m); + return 0; }
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 02f73a91058b..4add3803eb06 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -17,6 +17,7 @@ extern bool reliable_enabled; extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn); +extern void reliable_report_meminfo(struct seq_file *m);
static inline bool mem_reliable_is_enabled(void) { @@ -57,6 +58,7 @@ static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) { return false; } +static inline void reliable_report_meminfo(struct seq_file *m) {} #endif
#endif diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 7914c76c1fcd..aa89c874890e 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -5,6 +5,8 @@ #include <linux/mm.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/seq_file.h> +#include <linux/mmzone.h>
DEFINE_STATIC_KEY_FALSE(mem_reliable);
@@ -22,6 +24,18 @@ static unsigned long total_reliable_mem_sz(void) return atomic_long_read(&total_reliable_mem); }
+static unsigned long used_reliable_mem_sz(void) +{ + unsigned long nr_page = 0; + struct zone *z; + + for_each_populated_zone(z) + if (zone_idx(z) < ZONE_MOVABLE) + nr_page += zone_page_state(z, NR_FREE_PAGES); + + return total_reliable_mem_sz() - nr_page * PAGE_SIZE; +} + static int reliable_mem_notifier(struct notifier_block *nb, unsigned long action, void *arg) { @@ -77,3 +91,14 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) pr_info("init succeed, mirrored memory size(%lu)\n", total_reliable_mem_sz()); } + +void reliable_report_meminfo(struct seq_file *m) +{ + if (!mem_reliable_is_enabled()) + return; + + seq_printf(m, "ReliableTotal: %8lu kB\n", + total_reliable_mem_sz() >> 10); + seq_printf(m, "ReliableUsed: %8lu kB\n", + used_reliable_mem_sz() >> 10); +}
From: Peng Wu wupeng58@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM0Y
CVE: NA
------------------------------------------
Add a reliable flag for user tasks. A user task with the reliable flag can only allocate memory from the mirrored region. PF_RELIABLE is added to represent the task's reliable flag.
- The init task is regarded as a special task and allocates memory from the mirrored region.
- For normal user tasks, the reliable flag can be set via the procfs interface shown below and is inherited via fork().
Users can change a task's reliable flag with
$ echo [0/1] > /proc/<pid>/reliable
and check a task's reliable flag with
$ cat /proc/<pid>/reliable
Note that the global init task's reliable file cannot be accessed.
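A typical session (the PID is illustrative):

    $ cat /proc/1024/reliable
    0
    $ echo 1 > /proc/1024/reliable
    $ cat /proc/1024/reliable
    1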
Signed-off-by: Peng Wu wupeng58@huawei.com Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/filesystems/proc.rst | 6 ++ fs/proc/base.c | 96 ++++++++++++++++++++++++++++++ include/linux/sched.h | 1 + mm/page_alloc.c | 12 ++++ 4 files changed, 115 insertions(+)
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index e1562bef4887..f6783bb99e3f 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -162,6 +162,8 @@ usually fail with ESRCH. can be derived from smaps, but is faster and more convenient numa_maps An extension based on maps, showing the memory locality and binding policy as well as mem usage (in pages) of each mapping. + reliable Present with CONFIG_MEMORY_RELIABLE=y. Process reliable status + information ============= ===============================================================
For example, to get the status information of a process, all you have to do is @@ -649,6 +651,10 @@ Where: node locality page counters (N0 == node0, N1 == node1, ...) and the kernel page size, in KB, that is backing the mapping up.
+The /proc/pid/reliable is used to control user process's reliable status. +Process with this flag can only alloc memory from mirrored region. Global +init task's reliable flag can not be accessed. + 1.2 Kernel data ---------------
diff --git a/fs/proc/base.c b/fs/proc/base.c index 7edbfd2ef757..2ba1313aa444 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1260,6 +1260,96 @@ static const struct file_operations proc_oom_score_adj_operations = { .llseek = default_llseek, };
+#ifdef CONFIG_MEMORY_RELIABLE +static inline int reliable_check(struct task_struct *task, struct pid *pid) +{ + if (!mem_reliable_is_enabled()) + return -EPERM; + + if (is_global_init(task)) + return -EPERM; + + if (!task->mm || (task->flags & PF_KTHREAD) || + (task->flags & PF_EXITING)) + return -EPERM; + + return 0; +} + +static ssize_t reliable_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct pid *pid = proc_pid(file_inode(file)); + char buffer[PROC_NUMBUF]; + size_t len; + short val; + int err; + + if (!task) + return -ESRCH; + + err = reliable_check(task, pid); + if (err) { + put_task_struct(task); + return err; + } + + val = task->flags & PF_RELIABLE ? 1 : 0; + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%hd\n", val); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t reliable_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct pid *pid = proc_pid(file_inode(file)); + char buffer[PROC_NUMBUF]; + int val; + int err; + + if (!task) + return -ESRCH; + + err = reliable_check(task, pid); + if (err) + goto out; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &val); + if (err) + goto out; + if (val != 0 && val != 1) { + err = -EINVAL; + goto out; + } + + if (val == 1) + task->flags |= PF_RELIABLE; + else + task->flags &= ~PF_RELIABLE; + +out: + put_task_struct(task); + return err < 0 ? err : count; +} + +static const struct file_operations proc_reliable_operations = { + .read = reliable_read, + .write = reliable_write, + .llseek = generic_file_llseek, +}; +#endif + #ifdef CONFIG_AUDIT #define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, @@ -3262,6 +3352,9 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_MEMORY_RELIABLE + REG("reliable", S_IRUGO|S_IWUSR, proc_reliable_operations), +#endif #ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), @@ -3609,6 +3702,9 @@ static const struct pid_entry tid_base_stuff[] = { ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_MEMORY_RELIABLE + REG("reliable", S_IRUGO|S_IWUSR, proc_reliable_operations), +#endif #ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), diff --git a/include/linux/sched.h b/include/linux/sched.h index c33f7a70629a..edd236f98f0c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1581,6 +1581,7 @@ extern struct pid *cad_pid; #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ +#define PF_RELIABLE 0x00000008 /* Allocate from reliable memory */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define 
PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bfe215fc0da1..a72df34fa210 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5140,6 +5140,15 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, } EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
+static inline void prepare_before_alloc(gfp_t *gfp_mask) +{ + if (!mem_reliable_is_enabled()) + return; + + if ((current->flags & PF_RELIABLE) || is_global_init(current)) + *gfp_mask |= GFP_RELIABLE; +} + /* * This is the 'heart' of the zoned buddy allocator. */ @@ -5161,6 +5170,9 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, }
gfp &= gfp_allowed_mask; + + prepare_before_alloc(&gfp); + alloc_gfp = gfp; if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
From: Peng Wu wupeng58@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S
DTS: #896
CVE: NA
-------------------------------------------------
Reserve a variable in mm_struct that will be used by memory reliable.
Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mm_types.h | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 75a9235f7aa9..1c22e294f083 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -585,6 +585,10 @@ struct mm_struct { #ifdef CONFIG_IOMMU_SUPPORT u32 pasid; #endif + +#ifdef CONFIG_MEMORY_RELIABLE + atomic_long_t reserve_0; +#endif } __randomize_layout;
KABI_RESERVE(1)
From: Zhou Guanghui zhouguanghui1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM0Y
CVE: NA
------------------------------------------
This feature depends on the overall memory reliable feature. When the shared memory reliable feature is enabled, pages used by shared memory are allocated from the mirrored region by default. If the mirrored region is insufficient, allocations can fall back to the non-mirrored region.
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mem_reliable.h | 9 +++++++++ mm/mem_reliable.c | 12 ++++++++++++ mm/shmem.c | 12 ++++++++++++ 3 files changed, 33 insertions(+)
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 4add3803eb06..8f858d11ce6f 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -13,10 +13,12 @@ extern struct static_key_false mem_reliable;
extern bool reliable_enabled; +extern bool shmem_reliable;
extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn); +extern void shmem_reliable_init(void); extern void reliable_report_meminfo(struct seq_file *m);
static inline bool mem_reliable_is_enabled(void) @@ -46,6 +48,11 @@ static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z)
return false; } + +static inline bool shmem_reliable_is_enabled(void) +{ + return shmem_reliable; +} #else #define reliable_enabled 0
@@ -53,12 +60,14 @@ static inline bool mem_reliable_is_enabled(void) { return false; } static inline void add_reliable_mem_size(long sz) {} static inline void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) {} +static inline void shmem_reliable_init(void) {} static inline bool zone_reliable(struct zone *zone) { return false; } static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) { return false; } static inline void reliable_report_meminfo(struct seq_file *m) {} +static inline bool shmem_reliable_is_enabled(void) { return false; } #endif
#endif diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index aa89c874890e..e977a4122f8a 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -13,6 +13,7 @@ DEFINE_STATIC_KEY_FALSE(mem_reliable); bool reliable_enabled;
static atomic_long_t total_reliable_mem; +bool shmem_reliable __read_mostly = true;
void add_reliable_mem_size(long sz) { @@ -92,6 +93,17 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) total_reliable_mem_sz()); }
+void shmem_reliable_init(void) +{ + if (!shmem_reliable_is_enabled()) + return; + + if (!mem_reliable_is_enabled()) { + shmem_reliable = false; + pr_info("shmem reliable disabled.\n"); + } +} + void reliable_report_meminfo(struct seq_file *m) { if (!mem_reliable_is_enabled()) diff --git a/mm/shmem.c b/mm/shmem.c index d36659e54542..746e48454cb8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1570,6 +1570,14 @@ static struct page *shmem_alloc_page(gfp_t gfp, return page; }
+static inline void shmem_prepare_alloc(gfp_t *gfp_mask) +{ + if (!shmem_reliable_is_enabled()) + return; + + *gfp_mask |= GFP_RELIABLE; +} + static struct page *shmem_alloc_and_acct_page(gfp_t gfp, struct inode *inode, pgoff_t index, bool huge, int node_id) @@ -1586,6 +1594,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, if (!shmem_inode_acct_block(inode, nr)) goto failed;
+ shmem_prepare_alloc(&gfp); + if (huge) page = shmem_alloc_hugepage(gfp, info, index, node_id); else @@ -3944,6 +3954,8 @@ int __init shmem_init(void) else shmem_huge = 0; /* just in case it was patched */ #endif + + shmem_reliable_init(); return 0;
out1:
From: Chen Wandun chenwandun@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM0Z
CVE: NA
--------------------------------
__page_cache_alloc() is used to allocate page cache pages in most filesystems, such as ext4 and f2fs, so add the GFP_RELIABLE flag to use reliable memory when allocating the page cache.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mem_reliable.h | 2 ++ mm/filemap.c | 2 ++ mm/mem_reliable.c | 6 ++++++ 3 files changed, 10 insertions(+)
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 8f858d11ce6f..38891cb2fa83 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -20,6 +20,7 @@ extern void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn); extern void shmem_reliable_init(void); extern void reliable_report_meminfo(struct seq_file *m); +extern void page_cache_prepare_alloc(gfp_t *gfp);
static inline bool mem_reliable_is_enabled(void) { @@ -68,6 +69,7 @@ static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) } static inline void reliable_report_meminfo(struct seq_file *m) {} static inline bool shmem_reliable_is_enabled(void) { return false; } +static inline void page_cache_prepare_alloc(gfp_t *gfp) {} #endif
#endif diff --git a/mm/filemap.c b/mm/filemap.c index f9e4760b9cbd..3958fc3280d8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -954,6 +954,8 @@ struct page *__page_cache_alloc(gfp_t gfp) int n; struct page *page;
+ page_cache_prepare_alloc(&gfp); + if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index e977a4122f8a..b1bc749532a4 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -15,6 +15,12 @@ bool reliable_enabled; static atomic_long_t total_reliable_mem; bool shmem_reliable __read_mostly = true;
+void page_cache_prepare_alloc(gfp_t *gfp) +{ + if (mem_reliable_is_enabled()) + *gfp |= GFP_RELIABLE; +} + void add_reliable_mem_size(long sz) { atomic_long_add(sz, &total_reliable_mem);
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01
CVE: NA
--------------------------------
Enable CONFIG_EFI_FAKE_MEMMAP & CONFIG_MEMORY_RELIABLE for arm64.
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 3a415daa4e43..b476b105ee10 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -643,6 +643,8 @@ CONFIG_FW_CFG_SYSFS=y CONFIG_EFI_ESRT=y CONFIG_EFI_VARS_PSTORE=y # CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE is not set +CONFIG_EFI_FAKE_MEMMAP=y +CONFIG_EFI_MAX_FAKE_MEM=8 CONFIG_EFI_PARAMS_FROM_FDT=y CONFIG_EFI_RUNTIME_WRAPPERS=y CONFIG_EFI_GENERIC_STUB=y @@ -1104,6 +1106,7 @@ CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y CONFIG_ARCH_HAS_PTE_SPECIAL=y CONFIG_PIN_MEMORY=y CONFIG_PID_RESERVE=y +CONFIG_MEMORY_RELIABLE=y
# # Data Access Monitoring
From: Hanjun Guo guohanjun@huawei.com
ascend inclusion
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I4U4KV
CVE: NA
----------------------------------------
acpi_get_table() should be coupled with acpi_put_table(), otherwise it leads to a memory leak. Fix the memory leak by calling acpi_put_table().
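The general pattern (a minimal sketch; the table signature and error handling depend on the caller):

    struct acpi_table_header *tbl;
    acpi_status status;

    status = acpi_get_table(ACPI_SIG_SPCR, 0, &tbl);
    if (ACPI_FAILURE(status))
            return;
    /* ... inspect the table ... */
    acpi_put_table(tbl);    /* drop the reference taken by acpi_get_table() */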
Signed-off-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/tty/serial/amba-pl011.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 379d83432e7e..4f2c9378931e 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -1516,6 +1516,8 @@ static void pl011_check_hisi_workaround(void) break; } } + + acpi_put_table(tbl); }
#else
From: Zhang Yi yi.zhang@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4U3RI
CVE: NA
--------------------------------
We found a NULL pointer dereference problem when using the dm-mpath target. The problem is that if we submit I/O between loading and binding the table, we can get neither a valid dm_target nor a valid dm table when submitting a request in dm_mq_queue_rq(). BIO-based dm targets already handle this case in dm_submit_bio(). Fix this by checking the mapping table before submitting the request.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/dm-rq.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index b1e867feb4f6..be708b7c66a1 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -492,8 +492,15 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
if (unlikely(!ti)) { int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - + struct dm_table *map; + + map = dm_get_live_table(md, &srcu_idx); + if (!map) { + DMERR_LIMIT("%s: mapping table unavailable, erroring io", + dm_device_name(md)); + dm_put_live_table(md, srcu_idx); + return BLK_STS_IOERR; + } ti = dm_table_find_target(map, 0); dm_put_live_table(md, srcu_idx); }
From: Zhang Yi yi.zhang@huawei.com
hulk inclusion
category: bugfix
bugzilla: 186216, https://gitee.com/openeuler/kernel/issues/I4PW7R
CVE: NA
--------------------------------
As with commit 1c2d14212b15 ("ext2: Fix underflow in ext2_max_size()") in the ext2 filesystem, the ext4 driver has the same issue with 64K block size and ^huge_file; fix this issue the same way as ext2. This patch also reverts commit 75ca6ad408f4 ("ext4: fix loff_t overflow in ext4_max_bitmap_size()") because it is no longer needed.
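For the curious, the underflow case being fixed looks roughly like this (64K blocks, so bits = 16 and ppb = 1 << (bits - 2) = 16384; numbers are approximate):

    /* ^huge_file: i_blocks is limited to 2^32 - 1 512-byte units */
    upper_limit = (1LL << 32) - 1;
    upper_limit >>= (bits - 9);        /* ~2^25 = 33,554,432 blocks */

    /* metadata for a fully populated block tree: */
    meta_blocks = 1 + (1 + ppb) + (1 + ppb + ppb * ppb);  /* ~2^28 */

    /* old code: upper_limit -= meta_blocks; 2^25 - 2^28 underflows */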
Signed-off-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/super.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 856733f756cf..19539be45aec 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3220,8 +3220,9 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { - unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS; + loff_t upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; + unsigned int ppb = 1 << (bits - 2);
/* * This is calculated to be the largest file size for a dense, block @@ -3253,27 +3254,42 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
}
- /* indirect blocks */ - meta_blocks = 1; - /* double indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)); - /* tripple indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); - - upper_limit -= meta_blocks; - upper_limit <<= bits; - + /* Compute how many blocks we can address by block tree */ res += 1LL << (bits-2); res += 1LL << (2*(bits-2)); res += 1LL << (3*(bits-2)); + /* Compute how many metadata blocks are needed */ + meta_blocks = 1; + meta_blocks += 1 + ppb; + meta_blocks += 1 + ppb + ppb * ppb; + /* Does block tree limit file size? */ + if (res + meta_blocks <= upper_limit) + goto check_lfs; + + res = upper_limit; + /* How many metadata blocks are needed for addressing upper_limit? */ + upper_limit -= EXT4_NDIR_BLOCKS; + /* indirect blocks */ + meta_blocks = 1; + upper_limit -= ppb; + /* double indirect blocks */ + if (upper_limit < ppb * ppb) { + meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb); + res -= meta_blocks; + goto check_lfs; + } + meta_blocks += 1 + ppb; + upper_limit -= ppb * ppb; + /* tripple indirect blocks for the rest */ + meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) + + DIV_ROUND_UP(upper_limit, ppb*ppb); + res -= meta_blocks; +check_lfs: res <<= bits; - if (res > upper_limit) - res = upper_limit; - if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE;
- return (loff_t)res; + return res; }
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
From: Zhang Changzhong zhangchangzhong@huawei.com
mainline inclusion
from mainline-v5.17-rc5
commit a6ab75cec1e461f8a35559054c146c21428430b8
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4UV5G
CVE: NA
--------------------------------
In __bond_release_one(), bond_set_carrier() is only called when the bond device has no slaves. Therefore, if we remove the up slave from a master with two slaves and keep the down slave, the master will wrongly remain up.
Fix this by moving bond_set_carrier() out of the if (!bond_has_slaves(bond)) statement.
Reproducer:

$ insmod bonding.ko mode=0 miimon=100 max_bonds=2
$ ifconfig bond0 up
$ ifenslave bond0 eth0 eth1
$ ifconfig eth0 down
$ ifenslave -d bond0 eth1
$ cat /proc/net/bonding/bond0
Fixes: ff59c4563a8d ("[PATCH] bonding: support carrier state for master") Signed-off-by: Zhang Changzhong zhangchangzhong@huawei.com Acked-by: Jay Vosburgh jay.vosburgh@canonical.com Link: https://lore.kernel.org/r/1645021088-38370-1-git-send-email-zhangchangzhong@... Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Zhang Changzhong zhangchangzhong@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/bonding/bond_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index cb823e2da910..c065da5b6ca2 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2280,10 +2280,9 @@ static int __bond_release_one(struct net_device *bond_dev, bond_select_active_slave(bond); }
- if (!bond_has_slaves(bond)) { - bond_set_carrier(bond); + bond_set_carrier(bond); + if (!bond_has_slaves(bond)) eth_hw_addr_random(bond_dev); - }
unblock_netpoll_tx(); synchronize_rcu();