From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM01 CVE: NA
--------------------------------
Introduction
============
Memory reliable feature is a memory tiering mechanism. It is based on kernel mirror feature, which splits memory into two separate regions, mirrored(reliable) region and non-mirrored (non-reliable) region.
for kernel mirror feature:
- allocate kernel memory from mirrored region by default - allocate user memory from non-mirrored region by default
non-mirrored region will be arranged into ZONE_MOVABLE.
for kernel reliable feature, it has additional features below:
- normal user tasks never alloc memory from mirrored region with userspace apis(malloc, mmap, etc.) - special user tasks will allocate memory from mirrored region by default - shmem/pagecache allocate memory from mirrored region by default - upper limit of mirrored region allocated for user tasks, shmem and page cache
Support a reliable fallback mechanism which allows special user tasks, shmem and page cache to fall back to allocating from the non-mirrored region; this is the default setting.
In order to fulfill the goal
- ___GFP_RELIABLE flag added for alloc memory from mirrored region.
- the high_zoneidx for special user tasks/shmem/pagecache is set to ZONE_NORMAL.
- normal user tasks could only alloc from ZONE_MOVABLE.
This patch is just the main framework; memory reliable support for special user tasks, page cache and shmem have their own patches.
To enable this function, mirrored(reliable) memory is needed and "kernelcore=reliable" should be added to kernel parameters.
Signed-off-by: Ma Wupeng mawupeng1@huawei.com --- .../admin-guide/kernel-parameters.txt | 6 +- include/linux/gfp.h | 6 ++ include/linux/gfp_types.h | 10 ++- include/linux/mem_reliable.h | 79 +++++++++++++++++++ include/linux/mm.h | 3 + include/trace/events/mmflags.h | 3 +- mm/Kconfig | 17 ++++ mm/Makefile | 1 + mm/mem_reliable.c | 65 +++++++++++++++ mm/mm_init.c | 28 ++++++- mm/page_alloc.c | 4 + tools/perf/builtin-kmem.c | 1 + 12 files changed, 219 insertions(+), 4 deletions(-) create mode 100644 include/linux/mem_reliable.h create mode 100644 mm/mem_reliable.c
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e755f76f76bd..01fc38217459 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2449,7 +2449,7 @@ keepinitrd [HW,ARM]
kernelcore= [KNL,X86,IA-64,PPC] - Format: nn[KMGTPE] | nn% | "mirror" + Format: nn[KMGTPE] | nn% | "mirror" | "reliable" This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested amount is spread evenly throughout all nodes in the @@ -2473,6 +2473,10 @@ for Movable pages. "nn[KMGTPE]", "nn%", and "mirror" are exclusive, so you cannot specify multiple forms.
+ Option "reliable" is based on option "mirror", but makes + some extensions. These two features are alternatives. + Currently only arm64 is supported. + kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. Format: <Controller#>[,poll interval] The controller # is the number of the ehci usb debug diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5b917e5b9350..83a75c7344c3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -134,6 +134,12 @@ static inline enum zone_type gfp_zone(gfp_t flags) z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + +#ifdef CONFIG_MEMORY_RELIABLE + if (z == ZONE_MOVABLE && (flags & GFP_RELIABLE)) + return ZONE_NORMAL; +#endif + return z; }
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6583a58670c5..d88913d62431 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -31,7 +31,11 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -/* 0x200u unused */ +#ifdef CONFIG_MEMORY_RELIABLE +#define ___GFP_RELIABLE 0x200u +#else +#define ___GFP_RELIABLE 0 +#endif #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -248,6 +252,9 @@ typedef unsigned int __bitwise gfp_t; /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
+/* Alloc memory from mirrored region */ +#define __GFP_RELIABLE ((__force gfp_t)___GFP_RELIABLE) + /* Room for N __GFP_FOO bits */ #define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) @@ -336,5 +343,6 @@ typedef unsigned int __bitwise gfp_t; #define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) +#define GFP_RELIABLE __GFP_RELIABLE
#endif /* __LINUX_GFP_TYPES_H */ diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h new file mode 100644 index 000000000000..5df1c157a2bd --- /dev/null +++ b/include/linux/mem_reliable.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MM_MEM_RELIABLE__ +#define __MM_MEM_RELIABLE__ + +#ifdef CONFIG_MEMORY_RELIABLE + +#include <linux/stddef.h> +#include <linux/gfp.h> +#include <linux/mmzone.h> +#include <linux/mm_types.h> +#include <linux/sched.h> + +DECLARE_STATIC_KEY_FALSE(mem_reliable); + +extern bool reliable_enabled; + +void mem_reliable_init(bool has_unmirrored_mem, unsigned long mirrored_sz); +bool mem_reliable_status(void); + +static inline bool mem_reliable_is_enabled(void) +{ + return static_branch_likely(&mem_reliable); +} + +static inline bool page_reliable(struct page *page) +{ + if (!mem_reliable_is_enabled()) + return false; + + if (!page) + return false; + + return page_zonenum(page) < ZONE_MOVABLE; +} + +static inline bool folio_reliable(struct folio *folio) +{ + if (!mem_reliable_is_enabled()) + return false; + + if (!folio) + return false; + + return folio_zonenum(folio) < ZONE_MOVABLE; +} + +static inline bool skip_non_mirrored_zone(gfp_t gfp, struct zoneref *z) +{ + if (!mem_reliable_is_enabled()) + return false; + + if (!current->mm || (current->flags & PF_KTHREAD)) + return false; + + /* user tasks can only alloc memory from non-mirrored region */ + if (!(gfp & GFP_RELIABLE) && (gfp & __GFP_HIGHMEM) && + (gfp & __GFP_MOVABLE)) { + if (zonelist_zone_idx(z) < ZONE_MOVABLE) + return true; + } + + return false; +} +#else +#define reliable_enabled 0 + +static inline bool mem_reliable_is_enabled(void) { return false; } +static inline void mem_reliable_init(bool has_unmirrored_mem, + unsigned long mirrored_sz) {} +static inline bool page_reliable(struct page *page) { return false; } +static inline bool folio_reliable(struct folio *folio) { return false; } +static inline bool skip_non_mirrored_zone(gfp_t gfp, 
struct zoneref *z) +{ + return false; +} +static inline bool mem_reliable_status(void) { return false; } +#endif + +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bacc4da324..ef8987fa42d2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4089,4 +4089,7 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)
#endif
+/* added to mm.h to avoid every caller adding new header file */ +#include <linux/mem_reliable.h> + #endif /* _LINUX_MM_H */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 18d30581137a..6e24b2fbc445 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -50,7 +50,8 @@ gfpflag_string(__GFP_RECLAIM), \ gfpflag_string(__GFP_DIRECT_RECLAIM), \ gfpflag_string(__GFP_KSWAPD_RECLAIM), \ - gfpflag_string(__GFP_ZEROTAGS) + gfpflag_string(__GFP_ZEROTAGS), \ + gfpflag_string(__GFP_RELIABLE)
#ifdef CONFIG_KASAN_HW_TAGS #define __def_gfpflag_names_kasan , \ diff --git a/mm/Kconfig b/mm/Kconfig index c277bb069ab7..2df11b146c84 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1348,6 +1348,23 @@ config CLEAR_FREELIST_PAGE To enable this feature, kernel parameter "clear_freelist" also needs to be added.
+config MEMORY_RELIABLE + bool "Support for memory reliable" + depends on ARM64 + default n + help + Memory reliable is based on mirror memory. It has the following + additional features: + a) normal user tasks never alloc memory from mirrored region; + b) special user tasks will allocate memory from mirrored region + by default; c) upper limit of mirrored region allocated for user + tasks, shmem and pagecache. + Special user tasks and shmem/pagecache can fallback to + non-mirrored region if you enable reliable fallback mechanism. + + To enable this function, mirrored memory is needed and + "kernelcore=reliable" needs to be added in kernel parameters. + source "mm/damon/Kconfig"
endmenu diff --git a/mm/Makefile b/mm/Makefile index 6759053ed782..e1a853e31856 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -144,3 +144,4 @@ obj-$(CONFIG_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o +obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c new file mode 100644 index 000000000000..53d11d48555e --- /dev/null +++ b/mm/mem_reliable.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "mem reliable: " fmt + +#include <linux/mm.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <linux/crash_dump.h> + +#define PAGES_TO_B(n_pages) ((n_pages) << PAGE_SHIFT) + +DEFINE_STATIC_KEY_FALSE(mem_reliable); +EXPORT_SYMBOL_GPL(mem_reliable); + +bool reliable_enabled; + +bool mem_reliable_status(void) +{ + return mem_reliable_is_enabled(); +} +EXPORT_SYMBOL_GPL(mem_reliable_status); + +static unsigned long total_reliable_pages(void) +{ + unsigned long total_reliable_pages = 0; + struct zone *z; + + for_each_populated_zone(z) + if (zone_idx(z) < ZONE_MOVABLE) + total_reliable_pages += zone_managed_pages(z); + + return total_reliable_pages; +} + +static unsigned long free_reliable_pages(void) +{ + struct zone *zone; + unsigned long cnt = 0; + + for_each_populated_zone(zone) + if (zone_idx(zone) < ZONE_MOVABLE) + cnt += zone_page_state(zone, NR_FREE_PAGES); + + return cnt; +} + +static unsigned long used_reliable_pages(void) +{ + return total_reliable_pages() - free_reliable_pages(); +} + +void mem_reliable_init(bool has_unmirrored_mem, unsigned long mirrored_sz) +{ + if (!reliable_enabled) + return; + + if (!has_unmirrored_mem) { + pr_err("init failed, unmirrored memory size is zero.\n"); + return; + } + + static_branch_enable(&mem_reliable); + + pr_info("init succeed, mirrored memory size(%lu)\n", mirrored_sz); +} diff --git 
a/mm/mm_init.c b/mm/mm_init.c index 2318ae6de638..0a3c20a00318 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -269,10 +269,29 @@ static int __init cmdline_parse_kernelcore(char *p) { /* parse kernelcore=mirror */ if (parse_option_str(p, "mirror")) { + if (reliable_enabled) { + pr_warn("kernelcore=reliable and kernelcore=mirror are alternative.\n"); + return -EINVAL; + } + mirrored_kernelcore = true; return 0; }
+#ifdef CONFIG_MEMORY_RELIABLE + /* parse kernelcore=reliable */ + if (parse_option_str(p, "reliable")) { + if (!reliable_enabled && mirrored_kernelcore) { + pr_warn("kernelcore=mirror and kernelcore=reliable are alternative.\n"); + return -EINVAL; + } + + reliable_enabled = true; + mirrored_kernelcore = true; + return 0; + } +#endif + return cmdline_parse_core(p, &required_kernelcore, &required_kernelcore_percent); } @@ -376,6 +395,8 @@ static void __init find_zone_movable_pfns_for_nodes(void) */ if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; + bool has_unmirrored_mem = false; + unsigned long mirrored_sz = 0;
if (!memblock_has_mirror()) { pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n"); @@ -388,8 +409,10 @@ static void __init find_zone_movable_pfns_for_nodes(void) }
for_each_mem_region(r) { - if (memblock_is_mirror(r)) + if (memblock_is_mirror(r)) { + mirrored_sz += r->size; continue; + }
nid = memblock_get_region_node(r);
@@ -400,6 +423,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) continue; }
+ has_unmirrored_mem = true; zone_movable_pfn[nid] = zone_movable_pfn[nid] ? min(usable_startpfn, zone_movable_pfn[nid]) : usable_startpfn; @@ -408,6 +432,8 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mem_below_4gb_not_mirrored) pr_warn("This configuration results in unmirrored kernel memory.\n");
+ mem_reliable_init(has_unmirrored_mem, mirrored_sz); + goto out2; }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f5b61c1060d1..efde73e24b3b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3189,6 +3189,10 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct page *page; unsigned long mark;
+ /* skip non-mirrored zone for normal user tasks */ + if (skip_non_mirrored_zone(gfp_mask, z)) + continue; + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && !__cpuset_zone_allowed(zone, gfp_mask)) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 9714327fd0ea..db9ca1d84b74 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -682,6 +682,7 @@ static const struct { { "__GFP_RECLAIM", "R" }, { "__GFP_DIRECT_RECLAIM", "DR" }, { "__GFP_KSWAPD_RECLAIM", "KR" }, + { "__GFP_RELIABLE", "REL" }, };
static size_t max_gfp_len;