From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW CVE: NA
-------------------
When memory is insufficient or fragmentation is severe, a 2MB hugepage allocation will perform direct reclaim and compaction.
Direct reclaim and compaction may take a long time. As a result, the sp mutex is held for too long, which causes hung task problems. To avoid this, set the PF_MEMALLOC flag to prevent direct reclaim and compaction from being executed.
Direct compaction is then not allowed during hugepage allocation. As a result, allocating a 2MB hugepage may fail.
During sp alloc, if a 2MB hugepage cannot be allocated or the total free memory is less than 1/3 of total memory, a work item is queued to compact memory asynchronously.
During sp free, if the total free memory is less than 1/3 of total memory, memory is compacted synchronously.
This behavior can be disabled, and the compaction frequency changed, through sysctl.
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sysctl.c | 18 ++++++++++++++++ mm/share_pool.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8f417c7b12e8..c7073b652b0c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3269,6 +3269,24 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "sharepool_compact_enable", + .data = &sysctl_sp_compact_enable, + .maxlen = sizeof(sysctl_sp_compact_enable), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sharepool_compact_interval", + .data = &sysctl_sp_compact_interval, + .maxlen = sizeof(sysctl_sp_compact_interval), + .mode = 0600, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &sysctl_sp_compact_interval_max, + }, #endif { } }; diff --git a/mm/share_pool.c b/mm/share_pool.c index 0ea113b904cf..1cd1a64f2a8c 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1018,6 +1018,63 @@ void sp_area_drop(struct vm_area_struct *vma) spin_unlock(&sp_area_lock); }
+/* Backing variable of the "sharepool_compact_enable" sysctl; 0 disables the compaction requests below. */
+int sysctl_sp_compact_enable;
+/* Minimum number of seconds between two compaction runs (multiplied by HZ where it is used). */
+unsigned long sysctl_sp_compact_interval = 30UL;
+/* Upper bound accepted for the "sharepool_compact_interval" sysctl. */
+unsigned long sysctl_sp_compact_interval_max = 1000UL;
+/* jiffies value recorded when the most recent compaction work finished. */
+static unsigned long compact_last_jiffies;
+/* COMPACT_START while a compaction work is queued or running, COMPACT_STOP otherwise. */
+static unsigned long compact_daemon_status;
+#define COMPACT_START 1
+#define COMPACT_STOP 0
+
+/*
+ * sp_compact_nodes() - work handler that runs one compaction pass.
+ * @work: the work item allocated by sp_add_work_compact(); freed here.
+ *
+ * Triggers compaction through sysctl_compaction_handler() with its write
+ * argument set, then records the completion time and clears the "in
+ * progress" flag so that a new request can be queued.
+ */
+static void sp_compact_nodes(struct work_struct *work)
+{
+	sysctl_compaction_handler(NULL, 1, NULL, NULL, NULL);
+
+	kfree(work);
+
+	compact_last_jiffies = jiffies;
+	cmpxchg(&compact_daemon_status, COMPACT_START, COMPACT_STOP);
+}
+
+/*
+ * sp_add_work_compact() - queue an asynchronous compaction if allowed.
+ *
+ * Does nothing when compaction is disabled through sysctl, when the last
+ * compaction finished less than sysctl_sp_compact_interval seconds ago, or
+ * when a compaction work is already queued or running.
+ */
+static void sp_add_work_compact(void)
+{
+	struct work_struct *compact_work;
+
+	if (!sysctl_sp_compact_enable)
+		return;
+
+	/* experimental compaction time: 4GB->1.7s, 8GB->3.4s */
+	if (!time_after(jiffies,
+		compact_last_jiffies + sysctl_sp_compact_interval * HZ))
+		return;
+
+	/* Only the caller that flips STOP -> START may queue the work. */
+	if (cmpxchg(&compact_daemon_status, COMPACT_STOP, COMPACT_START) ==
+		    COMPACT_START)
+		return;
+
+	compact_work = kzalloc(sizeof(*compact_work), GFP_KERNEL);
+	if (!compact_work) {
+		/*
+		 * No worker will run to clear the flag, so release it here;
+		 * otherwise every later request would be rejected by the
+		 * cmpxchg() above and compaction would never run again.
+		 */
+		cmpxchg(&compact_daemon_status, COMPACT_START, COMPACT_STOP);
+		return;
+	}
+
+	INIT_WORK(compact_work, sp_compact_nodes);
+	schedule_work(compact_work);
+}
+
+/*
+ * sp_try_to_compact() - request compaction when free memory is low.
+ *
+ * Queues a compaction work only if the number of free pages is at most one
+ * third of the total number of RAM pages.
+ */
+static void sp_try_to_compact(void)
+{
+	unsigned long totalram;
+	unsigned long freeram;
+
+	totalram = totalram_pages();
+	freeram = global_zone_page_state(NR_FREE_PAGES);
+
+	/* free < total / 3 */
+	if ((freeram + (freeram << 1)) > totalram)
+		return;
+
+	sp_add_work_compact();
+}
+
 /**
  * sp_free() - Free the memory allocated by sp_alloc().
  * @addr: the starting VA of the memory.