From: Ma Wupeng <mawupeng1@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S
CVE: NA
--------------------------------
Introduction
============
The memory reliable feature is a memory tiering mechanism. It is based on the kernel mirror feature, which splits memory into two separate regions: a mirrored (reliable) region and a non-mirrored (non-reliable) region.
For the kernel mirror feature:

- kernel memory is allocated from the mirrored region by default
- user memory is allocated from the non-mirrored region by default

The non-mirrored region is arranged into ZONE_MOVABLE.
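
As a rough illustration of that split (plain kernel mirror behaviour,
not code added by this patch):

	struct page *page;

	/* kernel allocation -> ZONE_NORMAL, i.e. the mirrored region */
	page = alloc_pages(GFP_KERNEL, 0);

	/* user allocation -> ZONE_MOVABLE, i.e. the non-mirrored region */
	page = alloc_pages(GFP_HIGHUSER_MOVABLE, 0);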
For the memory reliable feature, the following is added on top of that:

- normal user tasks never allocate memory from the mirrored region via
  userspace APIs (malloc, mmap, etc.)
- special user tasks allocate memory from the mirrored region by default
- tmpfs/pagecache allocate memory from the mirrored region by default
- an upper limit on the mirrored region allocated for user tasks, tmpfs
  and pagecache

A reliable fallback mechanism is also supported, which allows special
user tasks, tmpfs and the pagecache to fall back to allocating from the
non-mirrored region; this is the default setting.
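
The fallback logic itself lands in later patches of this series;
conceptually it behaves like the sketch below, where
reliable_allow_fallback is a hypothetical knob standing in for the
real mechanism:

	struct page *page;

	/* first try the mirrored (reliable) region */
	page = alloc_pages(GFP_HIGHUSER_MOVABLE | ___GFP_RELIABILITY, 0);

	/* on failure, fall back to the non-mirrored region */
	if (!page && reliable_allow_fallback)
		page = alloc_pages(GFP_HIGHUSER_MOVABLE, 0);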
To achieve this:

- a ___GFP_RELIABILITY flag is added for allocating memory from the
  mirrored region
- the high_zoneidx for special user tasks/tmpfs/pagecache is set to
  ZONE_NORMAL
- normal user tasks can only allocate from ZONE_MOVABLE
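
With the gfp_zone() change below, the flag works roughly as follows (a
sketch assuming CONFIG_MEMORY_RELIABLE=y and memory reliable enabled,
so that prepare_before_alloc() keeps the flag):

	enum zone_type z;

	/* normal user allocation stays in the non-mirrored region */
	z = gfp_zone(GFP_HIGHUSER_MOVABLE);	/* ZONE_MOVABLE */

	/* reliable allocation is redirected to the mirrored region */
	z = gfp_zone(GFP_HIGHUSER_MOVABLE | ___GFP_RELIABILITY);	/* ZONE_NORMAL */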
This patch only adds the main framework; memory reliable support for special user tasks, the pagecache and tmpfs is added in patches of their own.
To enable this feature, mirrored (reliable) memory is needed and "kernelcore=reliable" must be added to the kernel parameters.
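
For example, on a machine without real mirrored memory, a slice of RAM
can be faked as mirrored for testing via efi_fake_mem (attribute
0x10000 is EFI_MEMORY_MORE_RELIABLE); an illustrative command line,
not part of this patch:

	efi_fake_mem=4G@9G:0x10000 kernelcore=reliable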
Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 .../admin-guide/kernel-parameters.txt |  5 +-
 include/linux/gfp.h                   | 12 +++
 include/linux/mem_reliable.h          | 64 +++++++++++++++
 include/linux/mm.h                    |  3 +
 mm/Kconfig                            | 18 +++++
 mm/Makefile                           |  1 +
 mm/mem_reliable.c                     | 78 +++++++++++++++++++
 mm/page_alloc.c                       | 46 ++++++++++-
 8 files changed, 224 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/mem_reliable.h
 create mode 100644 mm/mem_reliable.c
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 886c900323f14..cc5eec8959a07 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1945,7 +1945,7 @@
 	keepinitrd	[HW,ARM]
 
 	kernelcore=	[KNL,X86,IA-64,PPC,ARM64]
-			Format: nn[KMGTPE] | nn% | "mirror"
+			Format: nn[KMGTPE] | nn% | "mirror" | "reliable"
 			This parameter specifies the amount of memory usable by
 			the kernel for non-movable allocations.  The requested
 			amount is spread evenly throughout all nodes in the
@@ -1969,6 +1969,9 @@
 			for Movable pages.  "nn[KMGTPE]", "nn%", and "mirror"
 			are exclusive, so you cannot specify multiple forms.
 
+			Option "reliable" is based on option "mirror", with
+			some extensions. These two options are exclusive.
+
 	kgdbdbgp=	[KGDB,HW] kgdb over EHCI usb debug port.
 			Format: <Controller#>[,poll interval]
 			The controller # is the number of the ehci usb debug
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f78d1e89593fd..152cb9bdf4365 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -45,6 +45,12 @@ struct vm_area_struct;
 #define ___GFP_NOLOCKDEP	0
 #endif
 /* If the above are modified, __GFP_BITS_SHIFT may need updating */
+#ifdef CONFIG_MEMORY_RELIABLE
+/* add flag at the end of gfp_mask to avoid kapi change */
+#define ___GFP_RELIABILITY	0x40000000u
+#else
+#define ___GFP_RELIABILITY	0
+#endif
 
 /*
  * Physical address zone modifiers (see linux/mmzone.h - low four bits)
@@ -446,6 +452,12 @@ static inline enum zone_type gfp_zone(gfp_t flags)
 	z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
					 ((1 << GFP_ZONES_SHIFT) - 1);
 	VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
+
+#ifdef CONFIG_MEMORY_RELIABLE
+	if (z == ZONE_MOVABLE && flags & ___GFP_RELIABILITY)
+		return ZONE_NORMAL;
+#endif
+
 	return z;
 }
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h
new file mode 100644
index 0000000000000..b03108441e37a
--- /dev/null
+++ b/include/linux/mem_reliable.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __MM_MEM_RELIABLE__
+#define __MM_MEM_RELIABLE__
+
+#include <linux/stddef.h>
+#include <linux/gfp.h>
+#include <linux/mmzone.h>
+#include <linux/mm_types.h>
+#include <linux/sched.h>
+
+
+#ifdef CONFIG_MEMORY_RELIABLE
+
+extern struct static_key_false mem_reliable;
+
+extern bool reliable_enabled;
+
+extern void add_reliable_mem_size(long sz);
+extern void mem_reliable_init(bool has_unmirrored_mem,
+			      unsigned long *zone_movable_pfn);
+
+static inline bool mem_reliable_is_enabled(void)
+{
+	return static_branch_likely(&mem_reliable);
+}
+
+static inline bool zone_reliable(struct zone *zone)
+{
+	return mem_reliable_is_enabled() && zone_idx(zone) < ZONE_MOVABLE;
+}
+
+static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z)
+{
+	if (!mem_reliable_is_enabled())
+		return false;
+
+	if (!current->mm || (current->flags & PF_KTHREAD))
+		return false;
+
+	/* user tasks can only alloc memory from non-mirrored region */
+	if (!(gfp & ___GFP_RELIABILITY) && (gfp & __GFP_HIGHMEM) &&
+	    (gfp & __GFP_MOVABLE)) {
+		if (zonelist_zone_idx(z) < ZONE_MOVABLE)
+			return true;
+	}
+
+	return false;
+}
+#else
+#define reliable_enabled 0
+
+static inline bool mem_reliable_is_enabled(void) { return false; }
+static inline void add_reliable_mem_size(long sz) {}
+static inline void mem_reliable_init(bool has_unmirrored_mem,
+				     unsigned long *zone_movable_pfn) {}
+static inline bool zone_reliable(struct zone *zone) { return false; }
+static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z)
+{
+	return false;
+}
+
+#endif
+
+#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index be0be448c3f19..630b103065f4c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -28,6 +28,9 @@
 #include <linux/memremap.h>
 #include <linux/overflow.h>
 
+/* added to mm.h to avoid every caller adding new header file */
+#include <linux/mem_reliable.h>
+
 struct mempolicy;
 struct anon_vma;
 struct anon_vma_chain;
diff --git a/mm/Kconfig b/mm/Kconfig
index 12601505c4a4a..80d7b47ca9f53 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -831,4 +831,22 @@ config PID_RESERVE
 	  We record the pid of dump task in the reserve memory,
 	  and reserve the pids before init task start. In restore process,
 	  free the reserved pids and realloc them for use.
+
+config MEMORY_RELIABLE
+	bool "Support for memory reliable"
+	depends on ARM64
+	default n
+	help
+	  Memory reliable is based on mirror memory. It has the following
+	  additional features:
+	  a) normal user tasks never alloc memory from mirrored region;
+	  b) special user tasks will allocate memory from mirrored region
+	  by default; c) upper limit of mirrored region allocated for user
+	  tasks, tmpfs and pagecache.
+	  Special user tasks and tmpfs/pagecache can fallback to
+	  non-mirrored region if you enable reliable fallback mechanism.
+
+	  To enable this function, mirrored memory is needed and
+	  "kernelcore=reliable" needs to be added to kernel parameters.
+
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 8fba091be3868..741f9c250914c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -109,3 +109,4 @@ obj-$(CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE) += hugepage_tuning.o
 obj-$(CONFIG_PIN_MEMORY) += pin_mem.o
 obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
+obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c
new file mode 100644
index 0000000000000..2e21839ca49fb
--- /dev/null
+++ b/mm/mem_reliable.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "mem reliable: " fmt
+
+
+#include <linux/mm.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+
+DEFINE_STATIC_KEY_FALSE(mem_reliable);
+
+bool reliable_enabled;
+
+static atomic_long_t total_reliable_mem;
+
+void add_reliable_mem_size(long sz)
+{
+	atomic_long_add(sz, &total_reliable_mem);
+}
+
+static int reliable_mem_notifier(struct notifier_block *nb,
+				 unsigned long action, void *arg)
+{
+	struct memory_notify *m_arg = arg;
+	struct zone *zone;
+
+	switch (action) {
+	case MEM_ONLINE:
+		zone = page_zone(pfn_to_page(m_arg->start_pfn));
+		if (zone_reliable(zone))
+			add_reliable_mem_size(m_arg->nr_pages * PAGE_SIZE);
+		break;
+	case MEM_OFFLINE:
+		zone = page_zone(pfn_to_page(m_arg->start_pfn));
+		if (zone_reliable(zone))
+			add_reliable_mem_size(-m_arg->nr_pages * PAGE_SIZE);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block reliable_notifier_block = {
+	.notifier_call = reliable_mem_notifier,
+};
+
+void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn)
+{
+	if (!reliable_enabled)
+		return;
+
+	if (atomic_long_read(&total_reliable_mem) == 0) {
+		memset(zone_movable_pfn, 0,
+		       sizeof(unsigned long) * MAX_NUMNODES);
+
+		pr_err("init failed, mirrored memory size is zero.");
+
+		return;
+	}
+
+	if (!has_unmirrored_mem) {
+		pr_err("init failed, unmirrored memory size is zero.");
+
+		return;
+	}
+
+	if (register_hotmemory_notifier(&reliable_notifier_block)) {
+		pr_err("init failed, register memory notifier failed.");
+		return;
+	}
+
+	static_branch_enable(&mem_reliable);
+
+	pr_info("init succeed, mirrored memory size(%lu)",
+		atomic_long_read(&total_reliable_mem));
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4cad86f1e3a91..e1e513e851dec 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3454,6 +3454,10 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 		struct page *page;
 		unsigned long mark;
 
+		/* skip non-movable zone for normal user tasks */
+		if (skip_none_movable_zone(gfp_mask, z))
+			continue;
+
 		/*
 		 * CDM nodes get skipped if the requested gfp flag
 		 * does not have __GFP_THISNODE set or the nodemask
@@ -4557,6 +4561,18 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
 					ac->high_zoneidx, ac->nodemask);
 }
 
+static inline void prepare_before_alloc(gfp_t *gfp_mask)
+{
+	gfp_t gfp_ori = *gfp_mask;
+	*gfp_mask &= gfp_allowed_mask;
+
+	if (!mem_reliable_is_enabled())
+		return;
+
+	if (gfp_ori & ___GFP_RELIABILITY)
+		*gfp_mask |= ___GFP_RELIABILITY;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -4578,7 +4594,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
 		return NULL;
 	}
 
-	gfp_mask &= gfp_allowed_mask;
+	prepare_before_alloc(&gfp_mask);
+
 	alloc_mask = gfp_mask;
 	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
 		return NULL;
@@ -6912,10 +6929,13 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	 */
 	if (mirrored_kernelcore) {
 		bool mem_below_4gb_not_mirrored = false;
+		bool has_unmirrored_mem = false;
 
 		for_each_memblock(memory, r) {
-			if (memblock_is_mirror(r))
+			if (memblock_is_mirror(r)) {
+				add_reliable_mem_size(r->size);
 				continue;
+			}
 
 			nid = r->nid;
 
@@ -6926,6 +6946,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 				continue;
 			}
 
+			has_unmirrored_mem = true;
 			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
 				min(usable_startpfn, zone_movable_pfn[nid]) :
 				usable_startpfn;
@@ -6934,6 +6955,8 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 		if (mem_below_4gb_not_mirrored)
 			pr_warn("This configuration results in unmirrored kernel memory.");
 
+		mem_reliable_init(has_unmirrored_mem, zone_movable_pfn);
+
 		goto out2;
 	}
@@ -7226,9 +7249,28 @@ static int __init cmdline_parse_kernelcore(char *p)
 {
 	/* parse kernelcore=mirror */
 	if (parse_option_str(p, "mirror")) {
+		if (reliable_enabled) {
+			pr_info("kernelcore=reliable and kernelcore=mirror are exclusive.");
+			return -EINVAL;
+		}
+
+		mirrored_kernelcore = true;
+		return 0;
+	}
+
+#ifdef CONFIG_MEMORY_RELIABLE
+	/* parse kernelcore=reliable */
+	if (parse_option_str(p, "reliable")) {
+		if (!reliable_enabled && mirrored_kernelcore) {
+			pr_info("kernelcore=mirror and kernelcore=reliable are exclusive.");
+			return -EINVAL;
+		}
+
+		reliable_enabled = true;
 		mirrored_kernelcore = true;
 		return 0;
 	}
+#endif
 
 	return cmdline_parse_core(p, &required_kernelcore,
 				  &required_kernelcore_percent);