From: Ma Wupeng <mawupeng1@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S
CVE: NA
--------------------------------
Introduce a fallback mechanism for memory reliable. The following processes will fall back to the non-mirrored region if their allocation from the mirrored region fails:
- user tasks with the reliable flag
- THP collapse pages
- init tasks
- pagecache
- tmpfs
In order to achieve this goal, the buddy system will fall back to the non-mirrored region in the following situations:
- if __GFP_THISNODE is set in gfp_mask and the destination nodes do not have any zones available
- high_zoneidx will be set to ZONE_MOVABLE to allocate memory before triggering OOM
This mechanism is enabled by default and can be disabled by adding "reliable_debug=F" to the kernel parameters. It relies on CONFIG_MEMORY_RELIABLE and requires "kernelcore=reliable" in the kernel parameters.
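For example, a boot command line fragment that enables memory reliable but turns the fallback off might look like this (illustrative only; the rest of the command line is omitted):

    kernelcore=reliable reliable_debug=F

Omitting "reliable_debug=F" leaves the fallback enabled, which is the default.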
Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 .../admin-guide/kernel-parameters.txt |  9 +++
 include/linux/mem_reliable.h          |  7 ++
 mm/khugepaged.c                       | 30 +++++---
 mm/mem_reliable.c                     | 29 ++++++++
 mm/page_alloc.c                       | 70 ++++++++++++++++++-
 5 files changed, 133 insertions(+), 12 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index cc5eec8959a07..3fc729aab31a6 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1972,6 +1972,15 @@
 			Option "reliable" is base on option "mirror", but make
 			some extension. These two features are alternatives.
 
+	reliable_debug=	[ARM64]
+			Format: [F]
+			Only works with CONFIG_MEMORY_RELIABLE and
+			"kernelcore=reliable" is configured.
+			F: User tasks with PF_RELIABLE will not allocate
+			memory from non-mirrored region if this allocation
+			from mirrored region failed.
+			Pagecache and tmpfs will follow this rule too.
+
 	kgdbdbgp=	[KGDB,HW] kgdb over EHCI usb debug port.
 			Format: <Controller#>[,poll interval]
 			The controller # is the number of the ehci usb debug
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h
index e4097f0cff679..c9c4d94a4df46 100644
--- a/include/linux/mem_reliable.h
+++ b/include/linux/mem_reliable.h
@@ -16,6 +16,7 @@ extern struct static_key_false mem_reliable;
 extern bool reliable_enabled;
 extern atomic_long_t reliable_user_used_nr_page;
 extern unsigned long task_reliable_limit __read_mostly;
+extern bool reliable_allow_fallback;
 
 extern void add_reliable_mem_size(long sz);
 extern void mem_reliable_init(bool has_unmirrored_mem,
@@ -69,8 +70,14 @@ static inline bool reliable_mem_limit_check(unsigned long nr_page)
 	return atomic_long_read(&reliable_user_used_nr_page) + nr_page <=
 	       task_reliable_limit / PAGE_SIZE;
 }
+
+static inline bool reliable_allow_fb_enabled(void)
+{
+	return reliable_allow_fallback;
+}
 #else
 #define reliable_enabled		0
+#define reliable_allow_fb_enabled()	false
 
 static inline bool mem_reliable_is_enabled(void) { return false; }
 static inline void add_reliable_mem_size(long sz) {}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2b154ff6ee734..c9be18c669a17 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1235,10 +1235,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (ret) {
-		if (reliable &&
-		    !reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) {
-			ret = SCAN_ALLOC_HUGE_PAGE_FAIL;
-			goto out;
+		if (reliable) {
+			if (!reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) {
+				if (reliable_allow_fb_enabled()) {
+					reliable = false;
+				} else {
+					ret = SCAN_ALLOC_HUGE_PAGE_FAIL;
+					goto out;
+				}
+			}
 		}
 
 		node = khugepaged_find_target_node();
@@ -1695,15 +1700,20 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
 	rcu_read_unlock();
 
 	if (result == SCAN_SUCCEED) {
+		if (reliable) {
+			if (!reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) {
+				if (reliable_allow_fb_enabled()) {
+					reliable = false;
+				} else {
+					result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+					goto out;
+				}
+			}
+		}
+
 		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
 			result = SCAN_EXCEED_NONE_PTE;
 		} else {
-			if (reliable &&
-			    !reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) {
-				result = SCAN_ALLOC_HUGE_PAGE_FAIL;
-				goto out;
-			}
-
 			node = khugepaged_find_target_node();
 			collapse_shmem(mm, mapping, start, hpage,
 				       node, reliable);
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c
index c24c5b7cbca33..60a214e3b28f7 100644
--- a/mm/mem_reliable.c
+++ b/mm/mem_reliable.c
@@ -17,6 +17,7 @@ static atomic_long_t total_reliable_mem;
 atomic_long_t reliable_user_used_nr_page;
 /* reliable user limit for user tasks with reliable flag */
 unsigned long task_reliable_limit = ULONG_MAX;
+bool reliable_allow_fallback __read_mostly = true;
 
 void add_reliable_mem_size(long sz)
 {
@@ -204,3 +205,31 @@ void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order,
 	out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
 }
+
+static int __init setup_reliable_debug(char *str)
+{
+	if (*str++ != '=' || !*str)
+		/*
+		 * No options specified.
+		 */
+		goto out;
+
+	/*
+	 * Determine which debug features should be switched on
+	 */
+	for (; *str && *str != ','; str++) {
+		switch (*str) {
+		case 'F':
+			reliable_allow_fallback = false;
+			pr_info("fallback disabled.");
+			break;
+		default:
+			pr_err("reliable_debug option '%c' unknown. skipped\n",
+			       *str);
+		}
+	}
+
+out:
+	return 1;
+}
+__setup("reliable_debug", setup_reliable_debug);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bfc0c2d1825cd..455525e49c727 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3656,6 +3656,60 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
+#ifdef CONFIG_MEMORY_RELIABLE
+static inline struct zone *reliable_fb_find_zone(gfp_t gfp_mask,
+						 struct alloc_context *ac)
+{
+	if (!reliable_allow_fb_enabled())
+		return NULL;
+
+	/* dst nodemask may don't have zone we want, fallback here */
+	if ((gfp_mask & __GFP_THISNODE) && (ac->high_zoneidx == ZONE_NORMAL) &&
+	    (gfp_mask & ___GFP_RELIABILITY)) {
+		struct zoneref *ref = first_zones_zonelist(
+			ac->zonelist, ZONE_MOVABLE, ac->nodemask);
+		return ref->zone;
+	}
+
+	return NULL;
+}
+
+static inline struct page *
+reliable_fb_before_oom(gfp_t gfp_mask, int order,
+		       const struct alloc_context *ac)
+{
+	if (!reliable_allow_fb_enabled())
+		return NULL;
+
+	/* key user process alloc mem from movable zone to avoid oom */
+	if ((ac->high_zoneidx == ZONE_NORMAL) &&
+	    (gfp_mask & ___GFP_RELIABILITY)) {
+		struct alloc_context tmp_ac = *ac;
+
+		tmp_ac.high_zoneidx = ZONE_MOVABLE;
+		tmp_ac.preferred_zoneref = first_zones_zonelist(
+			ac->zonelist, ZONE_MOVABLE, ac->nodemask);
+		return get_page_from_freelist(
+			(gfp_mask | __GFP_HARDWALL) & ~__GFP_DIRECT_RECLAIM,
+			order, ALLOC_WMARK_HIGH | ALLOC_CPUSET, &tmp_ac);
+	}
+
+	return NULL;
+}
+#else
+static inline struct zone *reliable_fb_find_zone(gfp_t gfp_mask,
+						 struct alloc_context *ac)
+{
+	return NULL;
+}
+
+static inline struct page *reliable_fb_before_oom(gfp_t gfp_mask, int order,
+					const struct alloc_context *ac)
+{
+	return NULL;
+}
+#endif
+
 static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	const struct alloc_context *ac, unsigned long *did_some_progress)
@@ -3694,6 +3748,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	if (page)
 		goto out;
 
+	page = reliable_fb_before_oom(gfp_mask, order, ac);
+	if (page)
+		goto out;
+
 	/* Coredumps can quickly deplete all memory reserves */
 	if (current->flags & PF_DUMPCORE)
 		goto out;
@@ -4301,8 +4359,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 */
 	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
 					ac->high_zoneidx, ac->nodemask);
-	if (!ac->preferred_zoneref->zone)
-		goto nopage;
+	if (!ac->preferred_zoneref->zone) {
+		ac->preferred_zoneref->zone =
+			reliable_fb_find_zone(gfp_mask, ac);
+
+		if (!ac->preferred_zoneref->zone)
+			goto nopage;
+	}
 
 	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
 		wake_all_kswapds(order, gfp_mask, ac);
@@ -4602,6 +4665,9 @@ static inline bool prepare_before_alloc(gfp_t *gfp_mask, unsigned int order)
 		return true;
 	}
 
+	if (reliable_allow_fb_enabled())
+		return true;
+
 	return false;
 }
 
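If the option is parsed, the pr_info() added in setup_reliable_debug() should appear in the kernel log. A quick way to confirm after boot (illustrative; assumes the message has not rotated out of the ring buffer):

    dmesg | grep "fallback disabled"

No output means the fallback was not disabled and remains active, which is the default behaviour.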