From: Yang Yingliang yangyingliang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4IYRE
---------------------------
If the system has many CPUs (e.g. 128), it spends a lot of time printing messages to the console when executing echo q > /proc/sysrq-trigger. When /proc/sys/kernel/numa_balancing is enabled and a migration thread is woken up, the thread cannot continue until the printing finishes, which triggers a soft lockup.
PID: 619    TASK: ffffa02fdd8bec80    CPU: 121    COMMAND: "migration/121"
 #0 [ffff00000a103b10] __crash_kexec at ffff0000081bf200
 #1 [ffff00000a103ca0] panic at ffff0000080ec93c
 #2 [ffff00000a103d80] watchdog_timer_fn at ffff0000081f8a14
 #3 [ffff00000a103e00] __run_hrtimer at ffff00000819701c
 #4 [ffff00000a103e40] __hrtimer_run_queues at ffff000008197420
 #5 [ffff00000a103ea0] hrtimer_interrupt at ffff00000819831c
 #6 [ffff00000a103f10] arch_timer_dying_cpu at ffff000008b53144
 #7 [ffff00000a103f30] handle_percpu_devid_irq at ffff000008174e34
 #8 [ffff00000a103f70] generic_handle_irq at ffff00000816c5e8
 #9 [ffff00000a103f90] __handle_domain_irq at ffff00000816d1f4
#10 [ffff00000a103fd0] gic_handle_irq at ffff000008081860
--- <IRQ stack> ---
#11 [ffff00000d6e3d50] el1_irq at ffff0000080834c8
#12 [ffff00000d6e3d60] multi_cpu_stop at ffff0000081d9964
#13 [ffff00000d6e3db0] cpu_stopper_thread at ffff0000081d9cfc
#14 [ffff00000d6e3e10] smpboot_thread_fn at ffff00000811e0a8
#15 [ffff00000d6e3e70] kthread at ffff000008118988
To avoid this soft lockup, add touch_all_softlockup_watchdogs() in sysrq_timer_list_show()
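For illustration only (the actual change is shown in the hunks below), the pattern is simply to pet the watchdogs once per iteration of any loop that prints a large amount of per-CPU state to a slow console; the function below is a made-up sketch, not kernel code:

#include <linux/cpumask.h>
#include <linux/nmi.h>
#include <linux/printk.h>

/*
 * Illustrative sketch, not part of the patch: touching the
 * soft-lockup watchdogs on every iteration keeps tasks waiting on
 * the printing CPU (e.g. migration/N) from being reported as stuck.
 */
static void dump_per_cpu_state(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		touch_all_softlockup_watchdogs();
		pr_info("CPU%d: ...\n", cpu);	/* console output can be slow */
	}
}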
Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-By: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: wangxiongfeng 00379786 wangxiongfeng2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/time/timer_list.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index acb326f5f50a..4cb0e6f62e97 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -289,13 +289,17 @@ void sysrq_timer_list_show(void)
timer_list_header(NULL, now);
- for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + touch_all_softlockup_watchdogs(); print_cpu(NULL, cpu, now); + }
#ifdef CONFIG_GENERIC_CLOCKEVENTS timer_list_show_tickdevices_header(NULL); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + touch_all_softlockup_watchdogs(); print_tickdevice(NULL, tick_get_device(cpu), cpu); + } #endif return; }
hulk inclusion category: Feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4J6BS CVE: NA
-------------------------------------------------
In order to support the Intel Icelake platform, the following configs need to be set as suggested by Intel:

CONFIG_ACPI_HMAT=y
CONFIG_EDAC_I10NM=m
CONFIG_INTEL_SPEED_SELECT_INTERFACE=m
CONFIG_STM=m
CONFIG_STM_DUMMY=m
CONFIG_STM_SOURCE_CONSOLE=m
CONFIG_STM_SOURCE_HEARTBEAT=m
CONFIG_STM_SOURCE_FTRACE=m
CONFIG_INTEL_TH=m
CONFIG_INTEL_TH_PCI=m
CONFIG_INTEL_TH_ACPI=m
CONFIG_INTEL_TH_GTH=m
CONFIG_INTEL_TH_STH=m
CONFIG_INTEL_TH_MSU=m
CONFIG_INTEL_TH_PTI=m

Set the above configs in openeuler_defconfig by default.
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/configs/openeuler_defconfig | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index b25d908dc7a1..7b608301823c 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -550,7 +550,7 @@ CONFIG_ACPI_BGRT=y CONFIG_ACPI_NFIT=m # CONFIG_NFIT_SECURITY_DEBUG is not set CONFIG_ACPI_NUMA=y -# CONFIG_ACPI_HMAT is not set +CONFIG_ACPI_HMAT=y CONFIG_HAVE_ACPI_APEI=y CONFIG_HAVE_ACPI_APEI_NMI=y CONFIG_ACPI_APEI=y @@ -6379,7 +6379,7 @@ CONFIG_EDAC_I5100=m CONFIG_EDAC_I7300=m CONFIG_EDAC_SBRIDGE=m CONFIG_EDAC_SKX=m -# CONFIG_EDAC_I10NM is not set +CONFIG_EDAC_I10NM=m CONFIG_EDAC_PND2=m CONFIG_RTC_LIB=y CONFIG_RTC_MC146818_LIB=y @@ -6708,7 +6708,7 @@ CONFIG_INTEL_RST=m # # Intel Speed Select Technology interface support # -# CONFIG_INTEL_SPEED_SELECT_INTERFACE is not set +CONFIG_INTEL_SPEED_SELECT_INTERFACE=m # end of Intel Speed Select Technology interface support
CONFIG_INTEL_TURBO_MAX_3=y @@ -7395,8 +7395,18 @@ CONFIG_NVMEM_SYSFS=y # # HW tracing support # -# CONFIG_STM is not set -# CONFIG_INTEL_TH is not set +CONFIG_STM=m +CONFIG_STM_DUMMY=m +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +CONFIG_STM_SOURCE_FTRACE=m +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_ACPI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m # end of HW tracing support
# CONFIG_FPGA is not set
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK
------------------------------------------
Add three proc interfaces to control the page cache limit behavior (an illustrative usage example follows the list):
1. a switch to enable/disable this feature
2. a control for the page cache limit
3. a control for the ratio to reclaim when the page cache exceeds the limit
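For reference, these entries are registered in the vm sysctl table, so they are expected to appear as /proc/sys/vm/cache_reclaim_enable, /proc/sys/vm/cache_limit_ratio and /proc/sys/vm/cache_reclaim_ratio. An illustrative (not authoritative) usage would be 'echo 1 > /proc/sys/vm/cache_reclaim_enable' to enable the feature, 'echo 50 > /proc/sys/vm/cache_limit_ratio' to set the limit to roughly 50% of memory, and 'echo 10 > /proc/sys/vm/cache_reclaim_ratio' to control how much extra to reclaim once the limit is exceeded.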
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 14 +++++++++ kernel/sysctl.c | 32 ++++++++++++++++++++ mm/Kconfig | 12 ++++++++ mm/Makefile | 1 + mm/page_cache_limit.c | 51 ++++++++++++++++++++++++++++++++ 5 files changed, 110 insertions(+) create mode 100644 include/linux/page_cache_limit.h create mode 100644 mm/page_cache_limit.c
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h new file mode 100644 index 000000000000..98f12734114b --- /dev/null +++ b/include/linux/page_cache_limit.h @@ -0,0 +1,14 @@ +#ifndef _PAGECACHE_H +#define _PAGECACHE_H + +#ifdef CONFIG_SHRINK_PAGECACHE +extern int pagecache_reclaim_enable; +extern int pagecache_limit_ratio; +extern int pagecache_reclaim_ratio; + +int proc_page_cache_limit(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#else +#endif + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3ab6ea7853ba..b3ee0deaa8dd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -103,6 +103,9 @@ #ifdef CONFIG_LOCKUP_DETECTOR #include <linux/nmi.h> #endif +#ifdef CONFIG_SHRINK_PAGECACHE +#include <linux/page_cache_limit.h> +#endif
#if defined(CONFIG_SYSCTL)
@@ -3192,6 +3195,35 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#endif +#ifdef CONFIG_SHRINK_PAGECACHE + { + .procname = "cache_reclaim_enable", + .data = &pagecache_reclaim_enable, + .maxlen = sizeof(pagecache_reclaim_enable), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "cache_limit_ratio", + .data = &pagecache_limit_ratio, + .maxlen = sizeof(pagecache_limit_ratio), + .mode = 0600, + .proc_handler = proc_page_cache_limit, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&one_hundred, + }, + { + .procname = "cache_reclaim_ratio", + .data = &pagecache_reclaim_ratio, + .maxlen = sizeof(pagecache_reclaim_ratio), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&one_hundred, + }, #endif { } }; diff --git a/mm/Kconfig b/mm/Kconfig index 59fdace319fd..f565fc82c200 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -486,6 +486,18 @@ config FRONTSWAP
If unsure, say Y to enable frontswap.
+config SHRINK_PAGECACHE + bool "Enable shrinking the page cache" + depends on MMU + default n + help + SHRINK_PAGECACHE means that we do not want to keep the large number + of page cache in the system, even though page cache can greatly improve + the performance of the machine. Large number of page cache may result + in short of memory, which will result OOM at the same time, so in order + to keep page cache in a reasonable range, the number of page cache + should be limited, and that is what SHRINK_PAGECACHE does. + config MEMCG_QOS bool "Enable Memory Cgroup Priority" depends on MEMCG diff --git a/mm/Makefile b/mm/Makefile index 4d07adb60619..c14522bd17ed 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -125,3 +125,4 @@ obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_PIN_MEMORY) += pin_mem.o +obj-$(CONFIG_SHRINK_PAGECACHE) += page_cache_limit.o diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c new file mode 100644 index 000000000000..55fdea087804 --- /dev/null +++ b/mm/page_cache_limit.c @@ -0,0 +1,51 @@ +#include <linux/mm.h> +#include <linux/sysctl.h> + +int pagecache_reclaim_enable; +int pagecache_limit_ratio; +int pagecache_reclaim_ratio; + +static unsigned long pagecache_limit_pages; +static unsigned long node_pagecache_limit_pages[MAX_NUMNODES]; + +static unsigned long get_node_total_pages(int nid) +{ + int zone_type; + unsigned long managed_pages = 0; + pg_data_t *pgdat = NODE_DATA(nid); + + if (!pgdat) + return 0; + + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); + + return managed_pages; +} + +static void setup_pagecache_limit(void) +{ + int i; + unsigned long node_total_pages; + + pagecache_limit_pages = pagecache_limit_ratio * totalram_pages() / 100; + + for (i = 0; i < MAX_NUMNODES; i++) { + node_total_pages = get_node_total_pages(i); + node_pagecache_limit_pages[i] = node_total_pages * + pagecache_limit_ratio / 100; + } +} + +int proc_page_cache_limit(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (write && !ret) + setup_pagecache_limit(); + + return ret; +}
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK
------------------------------------------
Create a kthread for each node. When we choose to reclaim page cache asynchronously, these kthreads will be woken up on demand.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_cache_limit.c | 133 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+)
diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 55fdea087804..4afc08373a35 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -1,5 +1,9 @@ #include <linux/mm.h> #include <linux/sysctl.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/err.h>
int pagecache_reclaim_enable; int pagecache_limit_ratio; @@ -7,6 +11,8 @@ int pagecache_reclaim_ratio;
static unsigned long pagecache_limit_pages; static unsigned long node_pagecache_limit_pages[MAX_NUMNODES]; +static wait_queue_head_t *pagecache_limitd_wait_queue[MAX_NUMNODES]; +static struct task_struct *pagecache_limitd_tasks[MAX_NUMNODES];
static unsigned long get_node_total_pages(int nid) { @@ -49,3 +55,130 @@ int proc_page_cache_limit(struct ctl_table *table, int write,
return ret; } + +void kpagecache_limitd_stop(int nid) +{ + if (nid < 0 || nid >= MAX_NUMNODES) + return; + + if (pagecache_limitd_tasks[nid]) { + kthread_stop(pagecache_limitd_tasks[nid]); + pagecache_limitd_tasks[nid] = NULL; + } + + if (pagecache_limitd_wait_queue[nid]) { + kvfree(pagecache_limitd_wait_queue[nid]); + pagecache_limitd_wait_queue[nid] = NULL; + } +} + +static void wakeup_kpagecache_limitd(int nid) +{ + if (!pagecache_limitd_wait_queue[nid]) + return; + + if (!waitqueue_active(pagecache_limitd_wait_queue[nid])) + return; + + wake_up_interruptible(pagecache_limitd_wait_queue[nid]); +} + +static bool pagecache_overlimit(void) +{ + unsigned long total_pagecache; + + total_pagecache = global_node_page_state(NR_FILE_PAGES); + total_pagecache -= global_node_page_state(NR_SHMEM); + + return total_pagecache > pagecache_limit_pages; +} + +void wakeup_all_kpagecache_limitd(void) +{ + int nid; + + if (!pagecache_reclaim_enable || !pagecache_overlimit()) + return; + + for_each_node_state(nid, N_MEMORY) + wakeup_kpagecache_limitd(nid); +} + +static void shrink_page_cache(void) +{ + if (!pagecache_overlimit()) + return; +} + +static DECLARE_COMPLETION(setup_done); +static int pagecache_limitd(void *arg) +{ + DEFINE_WAIT(wait); + int nid = *(int *)arg; + + if (nid < 0 || nid >= MAX_NUMNODES) + nid = numa_node_id(); + + complete(&setup_done); + set_freezable(); + for (;;) { + try_to_freeze(); + shrink_page_cache(); + + prepare_to_wait(pagecache_limitd_wait_queue[nid], &wait, + TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + schedule(); + finish_wait(pagecache_limitd_wait_queue[nid], &wait); + } + + finish_wait(pagecache_limitd_wait_queue[nid], &wait); + + return 0; +} + +int kpagecache_limitd_run(int nid) +{ + int ret = 0; + wait_queue_head_t *queue_head = NULL; + + if (pagecache_limitd_tasks[nid] && pagecache_limitd_wait_queue[nid]) + return 0; + + queue_head = kvmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); + if (!queue_head) + return -ENOMEM; + + init_waitqueue_head(queue_head); + pagecache_limitd_wait_queue[nid] = queue_head; + pagecache_limitd_tasks[nid] = kthread_run(pagecache_limitd, + (void *)&nid, "kpagecache_limitd%d", nid); + + if (IS_ERR(pagecache_limitd_tasks[nid])) { + BUG_ON(system_state < SYSTEM_RUNNING); + ret = PTR_ERR(pagecache_limitd_tasks[nid]); + pr_err("Failed to start pagecache_limitd on node %d\n", nid); + pagecache_limitd_tasks[nid] = NULL; + kvfree(queue_head); + } else + wait_for_completion(&setup_done); + + return ret; +} + +static int __init kpagecache_limitd_init(void) +{ + int nid; + int ret; + + for_each_node_state(nid, N_MEMORY) { + ret = kpagecache_limitd_run(nid); + if (ret == -ENOMEM) + break; + } + + return 0; +} + +module_init(kpagecache_limitd_init);
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK
------------------------------------------
In a NUMA system, each node may have a different number of pages, so the number of pages to reclaim should be calculated per node when the page cache exceeds the page cache limit.
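As a worked example of the per-node target computed in node_nr_page_reclaim() below (numbers are illustrative only): on a node with 1,000,000 managed pages, pagecache_limit_ratio = 50 and pagecache_reclaim_ratio = 10, the node limit is 500,000 pages. If the node currently holds 600,000 non-shmem file pages, the reclaim target is (600,000 - 500,000) + 1,000,000 * 10 / 100 = 200,000 pages, i.e. enough to get back below the limit plus an extra reclaim-ratio share of the node, presumably so that reclaim is not re-triggered immediately.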
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_cache_limit.c | 52 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-)
diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 4afc08373a35..33164e19cfa2 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -104,10 +104,60 @@ void wakeup_all_kpagecache_limitd(void) wakeup_kpagecache_limitd(nid); }
+static unsigned long node_nr_page_cache(int nid) +{ + struct pglist_data *pgdat; + unsigned long num = 0; + + pgdat = NODE_DATA(nid); + if (!pgdat) + return 0; + + num = node_page_state(pgdat, NR_FILE_PAGES); + num -= node_page_state(pgdat, NR_SHMEM); + + return num; +} + +static unsigned long node_nr_page_reclaim(int nid) +{ + unsigned long nr_page_cache; + unsigned long nr_to_reclaim; + unsigned long total_pages; + + if (!node_pagecache_limit_pages[nid]) + return 0; + + nr_page_cache = node_nr_page_cache(nid); + if (!nr_page_cache) + return 0; + + if (nr_page_cache < node_pagecache_limit_pages[nid]) + return 0; + + total_pages = get_node_total_pages(nid); + nr_to_reclaim = nr_page_cache - node_pagecache_limit_pages[nid]; + nr_to_reclaim += total_pages * pagecache_reclaim_ratio / 100; + + return nr_to_reclaim; +} + +static void shrink_node_page_cache(int nid) +{ + unsigned long nr_to_reclaim; + + nr_to_reclaim = node_nr_page_reclaim(nid); +} + static void shrink_page_cache(void) { - if (!pagecache_overlimit()) + int nid; + + if (!pagecache_reclaim_enable || !pagecache_overlimit()) return; + + for_each_node_state(nid, N_MEMORY) + shrink_node_page_cache(nid); }
static DECLARE_COMPLETION(setup_done);
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK
------------------------------------------
Add the basic page shrinking logic. Slab pages and anonymous pages will not be reclaimed; in addition, the reclaim behavior follows these rules:

1. reclaim pages that do not need to be unmapped first
2. reclaim pages that need to be unmapped second
3. reclaim dirty pages last
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 9 ++++++ mm/page_cache_limit.c | 27 +++++++++++++--- mm/vmscan.c | 53 ++++++++++++++++++++++++++++++-- 3 files changed, 83 insertions(+), 6 deletions(-)
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index 98f12734114b..e4ef5919cb92 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -2,12 +2,21 @@ #define _PAGECACHE_H
#ifdef CONFIG_SHRINK_PAGECACHE +enum page_cache_reclaim_flag { + PAGE_CACHE_RECLAIM_NO_UNMAP, + PAGE_CACHE_RECLAIM_UNMAP, + PAGE_CACHE_RECLAIM_WRITEPAGE, + PAGE_CACHE_RECLAIM_NR_FLAGS, +}; + extern int pagecache_reclaim_enable; extern int pagecache_limit_ratio; extern int pagecache_reclaim_ratio;
int proc_page_cache_limit(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +unsigned long __shrink_node_page_cache(int nid, gfp_t mask, + unsigned long nr_to_reclaim, enum page_cache_reclaim_flag flag); #else #endif
diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 33164e19cfa2..1581334429e1 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -4,6 +4,8 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/err.h> +#include <linux/swap.h> +#include <linux/page_cache_limit.h>
int pagecache_reclaim_enable; int pagecache_limit_ratio; @@ -142,14 +144,31 @@ static unsigned long node_nr_page_reclaim(int nid) return nr_to_reclaim; }
-static void shrink_node_page_cache(int nid) +static void shrink_node_page_cache(int nid, gfp_t mask) { + int i; unsigned long nr_to_reclaim; + unsigned long nr_reclaimed; + enum page_cache_reclaim_flag flag;
nr_to_reclaim = node_nr_page_reclaim(nid); + if (nr_to_reclaim <= 0) + return; + + flag = 0; + for (i = PAGE_CACHE_RECLAIM_NO_UNMAP; + i < PAGE_CACHE_RECLAIM_NR_FLAGS; i++) { + nr_reclaimed = __shrink_node_page_cache(nid, mask, nr_to_reclaim, flag); + nr_to_reclaim -= nr_reclaimed; + + if (nr_to_reclaim <= 0) + break; + + flag |= i; + } }
-static void shrink_page_cache(void) +static void shrink_page_cache(gfp_t mask) { int nid;
@@ -157,7 +176,7 @@ static void shrink_page_cache(void) return;
for_each_node_state(nid, N_MEMORY) - shrink_node_page_cache(nid); + shrink_node_page_cache(nid, mask); }
static DECLARE_COMPLETION(setup_done); @@ -173,7 +192,7 @@ static int pagecache_limitd(void *arg) set_freezable(); for (;;) { try_to_freeze(); - shrink_page_cache(); + shrink_page_cache(GFP_KERNEL | __GFP_HIGHMEM);
prepare_to_wait(pagecache_limitd_wait_queue[nid], &wait, TASK_INTERRUPTIBLE); diff --git a/mm/vmscan.c b/mm/vmscan.c index 718840df14e1..732356256b26 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -64,6 +64,10 @@ #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h>
+#ifdef CONFIG_SHRINK_PAGECACHE +#include <linux/page_cache_limit.h> +#endif + struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; @@ -124,6 +128,9 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1;
+ /* can't shrink slab pages */ + unsigned int no_shrink_slab:1; + /* Allocation order */ s8 order;
@@ -2873,8 +2880,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
shrink_lruvec(lruvec, sc);
- shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, - sc->priority); + if (!sc->no_shrink_slab) + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority);
/* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -4586,3 +4594,44 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) return page; } EXPORT_SYMBOL_GPL(get_page_from_vaddr); + +#ifdef CONFIG_SHRINK_PAGECACHE +/* + * return the number of reclaimed pages + */ +unsigned long __shrink_node_page_cache(int nid, gfp_t mask, unsigned long nr_to_reclaim, + enum page_cache_reclaim_flag reclaim_flag) +{ + struct scan_control sc = { + .nr_to_reclaim = nr_to_reclaim, + .gfp_mask = mask, + .may_swap = 0, + .may_unmap = reclaim_flag | PAGE_CACHE_RECLAIM_UNMAP, + .may_writepage = reclaim_flag | PAGE_CACHE_RECLAIM_WRITEPAGE, + .target_mem_cgroup = NULL, + .priority = DEF_PRIORITY, + .reclaim_idx = MAX_NR_ZONES, + .no_shrink_slab = 1, + }; + + struct zonelist *zonelist = node_zonelist(nid, __GFP_THISNODE); + struct reclaim_state *old_rs = current->reclaim_state; + unsigned long nr_reclaimed; + unsigned int noreclaim_flag; + + if (!(mask & __GFP_RECLAIM)) + return 0; + + noreclaim_flag = memalloc_noreclaim_save(); + fs_reclaim_acquire(sc.gfp_mask); + current->reclaim_state = NULL; + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + current->reclaim_state = old_rs; + fs_reclaim_release(sc.gfp_mask); + memalloc_noreclaim_restore(noreclaim_flag); + + return nr_reclaimed; +} +#endif
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK
------------------------------------------
The kthreads and the page cache limit should be reconfigured on memory hotplug and hot-unplug.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 4 +++ mm/memory_hotplug.c | 3 +++ mm/page_cache_limit.c | 45 +++++++++++++++++++++++++------- 3 files changed, 43 insertions(+), 9 deletions(-)
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index e4ef5919cb92..7906b12af947 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -17,7 +17,11 @@ int proc_page_cache_limit(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); unsigned long __shrink_node_page_cache(int nid, gfp_t mask, unsigned long nr_to_reclaim, enum page_cache_reclaim_flag flag); +void kpagecache_limitd_stop(int nid); +int kpagecache_limitd_run(int nid); #else +static inline void kpagecache_limitd_stop(int nid) {} +static inline int kpagecache_limitd_run(int nid) { return 0; } #endif
#endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a009b6395b02..a8f0d804a758 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -38,6 +38,7 @@ #include <linux/rmap.h>
#include <asm/tlbflush.h> +#include <linux/page_cache_limit.h>
#include "internal.h" #include "shuffle.h" @@ -735,6 +736,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid); kcompactd_run(nid); + kpagecache_limitd_run(nid);
writeback_set_ratelimit();
@@ -1491,6 +1493,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) if (arg.status_change_nid >= 0) { kswapd_stop(node); kcompactd_stop(node); + kpagecache_limitd_stop(node); }
writeback_set_ratelimit(); diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 1581334429e1..0a3098c9bb33 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -31,18 +31,27 @@ static unsigned long get_node_total_pages(int nid) return managed_pages; }
-static void setup_pagecache_limit(void) +static void setup_node_pagecache_limit(int nid) { - int i; unsigned long node_total_pages;
+ node_total_pages = get_node_total_pages(nid); + node_pagecache_limit_pages[nid] = node_total_pages * pagecache_limit_ratio / 100; +} + +#define ALL_NODE (-1) +static void setup_pagecache_limit(int nid) +{ + int i; + pagecache_limit_pages = pagecache_limit_ratio * totalram_pages() / 100;
- for (i = 0; i < MAX_NUMNODES; i++) { - node_total_pages = get_node_total_pages(i); - node_pagecache_limit_pages[i] = node_total_pages * - pagecache_limit_ratio / 100; - } + if (nid != ALL_NODE) + setup_node_pagecache_limit(nid); + + else + for (i = 0; i < MAX_NUMNODES; i++) + setup_node_pagecache_limit(i); }
int proc_page_cache_limit(struct ctl_table *table, int write, @@ -53,7 +62,7 @@ int proc_page_cache_limit(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (write && !ret) - setup_pagecache_limit(); + setup_pagecache_limit(ALL_NODE);
return ret; } @@ -72,6 +81,8 @@ void kpagecache_limitd_stop(int nid) kvfree(pagecache_limitd_wait_queue[nid]); pagecache_limitd_wait_queue[nid] = NULL; } + + setup_pagecache_limit(nid); }
static void wakeup_kpagecache_limitd(int nid) @@ -207,7 +218,7 @@ static int pagecache_limitd(void *arg) return 0; }
-int kpagecache_limitd_run(int nid) +static int __kpagecache_limitd_run(int nid) { int ret = 0; wait_queue_head_t *queue_head = NULL; @@ -236,6 +247,22 @@ int kpagecache_limitd_run(int nid) return ret; }
+int kpagecache_limitd_run(int nid) +{ + int ret; + + if (nid < 0 || nid >= MAX_NUMNODES) + return -EINVAL; + + ret = __kpagecache_limitd_run(nid); + if (ret) + return ret; + + setup_pagecache_limit(nid); + + return 0; +} + static int __init kpagecache_limitd_init(void) { int nid;
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK
------------------------------------------
Add hooks in the functions add_to_page_cache() and add_to_page_cache_lru().
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 2 ++ include/linux/pagemap.h | 2 ++ mm/filemap.c | 2 ++ 3 files changed, 6 insertions(+)
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index 7906b12af947..2df08a0604d8 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -19,9 +19,11 @@ unsigned long __shrink_node_page_cache(int nid, gfp_t mask, unsigned long nr_to_reclaim, enum page_cache_reclaim_flag flag); void kpagecache_limitd_stop(int nid); int kpagecache_limitd_run(int nid); +void wakeup_all_kpagecache_limitd(void); #else static inline void kpagecache_limitd_stop(int nid) {} static inline int kpagecache_limitd_run(int nid) { return 0; } +static inline void wakeup_all_kpagecache_limitd(void) {} #endif
#endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0bfa9cce6589..dbb25f1dc2e9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -15,6 +15,7 @@ #include <linux/bitops.h> #include <linux/hardirq.h> /* for in_interrupt() */ #include <linux/hugetlb_inline.h> +#include <linux/page_cache_limit.h>
struct pagevec;
@@ -777,6 +778,7 @@ static inline int add_to_page_cache(struct page *page, { int error;
+ wakeup_all_kpagecache_limitd(); __SetPageLocked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) diff --git a/mm/filemap.c b/mm/filemap.c index ef611eb34aa7..f9e4760b9cbd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,6 +42,7 @@ #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> +#include <linux/page_cache_limit.h> #include "internal.h"
#define CREATE_TRACE_POINTS @@ -923,6 +924,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, void *shadow = NULL; int ret;
+ wakeup_all_kpagecache_limitd(); __SetPageLocked(page); ret = __add_to_page_cache_locked(page, mapping, offset, gfp_mask, &shadow);
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK
------------------------------------------
Add a proc interface to drop the page cache on a target node.
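For reference, the entry is registered in the vm sysctl table, so it should show up as /proc/sys/vm/node_drop_caches; an illustrative usage is 'echo 1 > /proc/sys/vm/node_drop_caches' to drop the reclaimable page cache of node 1 only.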
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/drop_caches.c | 36 ++++++++++++++++++++++++++++++-- include/linux/fs.h | 9 ++++++++ include/linux/page_cache_limit.h | 3 +++ kernel/sysctl.c | 8 +++++++ mm/page_cache_limit.c | 2 ++ mm/truncate.c | 34 +++++++++++++++++++++++++++--- 6 files changed, 87 insertions(+), 5 deletions(-)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c index f00fcc4a4f72..ff70ef7674e3 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -9,12 +9,17 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> + +#ifdef CONFIG_SHRINK_PAGECACHE +#include <linux/page_cache_limit.h> +#endif + #include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches;
-static void drop_pagecache_sb(struct super_block *sb, void *unused) +static void drop_pagecache_sb(struct super_block *sb, void *nid) { struct inode *inode, *toput_inode = NULL;
@@ -35,7 +40,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock);
- invalidate_mapping_pages(inode->i_mapping, 0, -1); + if (!nid) + invalidate_mapping_pages(inode->i_mapping, 0, -1); + else + node_invalidate_mapping_pages(inode->i_mapping, + *(int *)nid, 0, -1); + iput(toput_inode); toput_inode = inode;
@@ -74,3 +84,25 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, } return 0; } + +#ifdef CONFIG_SHRINK_PAGECACHE +int proc_shrink_node_caches(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + if (node_to_shrink >= MAX_NUMNODES) + return -EINVAL; + + if (!node_isset(node_to_shrink, node_states[N_MEMORY])) + return 0; + + iterate_supers(drop_pagecache_sb, &node_to_shrink); + + return 0; +} +#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index dd3379e76525..8f6704a3f596 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2613,6 +2613,15 @@ extern bool is_bad_inode(struct inode *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end);
+#ifdef CONFIG_SHRINK_PAGECACHE +unsigned long node_invalidate_mapping_pages(struct address_space *mapping, + int nid, pgoff_t start, pgoff_t end); +#else +static inline unsigned long +node_invalidate_mapping_pages(struct address_space *mapping, int nid, + pgoff_t start, pgoff_t end) { return 0; } +#endif + void invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index 2df08a0604d8..442d6126c529 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -12,6 +12,7 @@ enum page_cache_reclaim_flag { extern int pagecache_reclaim_enable; extern int pagecache_limit_ratio; extern int pagecache_reclaim_ratio; +extern int node_to_shrink;
int proc_page_cache_limit(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -20,6 +21,8 @@ unsigned long __shrink_node_page_cache(int nid, gfp_t mask, void kpagecache_limitd_stop(int nid); int kpagecache_limitd_run(int nid); void wakeup_all_kpagecache_limitd(void); +int proc_shrink_node_caches(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); #else static inline void kpagecache_limitd_stop(int nid) {} static inline int kpagecache_limitd_run(int nid) { return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b3ee0deaa8dd..261787cebd8e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3224,6 +3224,14 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = (void *)&one_hundred, }, + { + .procname = "node_drop_caches", + .data = &node_to_shrink, + .maxlen = sizeof(node_to_shrink), + .mode = 0600, + .proc_handler = proc_shrink_node_caches, + .extra1 = SYSCTL_ZERO, + }, #endif { } }; diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 0a3098c9bb33..0ccc1388c8dc 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -5,11 +5,13 @@ #include <linux/module.h> #include <linux/err.h> #include <linux/swap.h> +#include <linux/fs.h> #include <linux/page_cache_limit.h>
int pagecache_reclaim_enable; int pagecache_limit_ratio; int pagecache_reclaim_ratio; +int node_to_shrink;
static unsigned long pagecache_limit_pages; static unsigned long node_pagecache_limit_pages[MAX_NUMNODES]; diff --git a/mm/truncate.c b/mm/truncate.c index 98d08f197766..6d4887a43cd8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -465,7 +465,7 @@ void truncate_inode_pages_final(struct address_space *mapping) EXPORT_SYMBOL(truncate_inode_pages_final);
static unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec, int nid) { pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; @@ -487,6 +487,10 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, page); continue; } + + if (nid != NUMA_NO_NODE && page_to_nid(page) != nid) + continue; + index += thp_nr_pages(page) - 1;
ret = invalidate_inode_page(page); @@ -529,10 +533,34 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { - return __invalidate_mapping_pages(mapping, start, end, NULL); + return __invalidate_mapping_pages(mapping, start, end, NULL, NUMA_NO_NODE); } EXPORT_SYMBOL(invalidate_mapping_pages);
+ +/** + * node_invalidate_mapping_pages - Invalidate all the unlocked pages in @nid of one inode + * @mapping: the address_space which holds the pages to invalidate + * @nid: pages belong to this node will be invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * node_invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + * + * Return: the number of the pages that were invalidated + */ +#ifdef CONFIG_SHRINK_PAGECACHE +unsigned long node_invalidate_mapping_pages(struct address_space *mapping, + int nid, pgoff_t start, pgoff_t end) +{ + return __invalidate_mapping_pages(mapping, start, end, NULL, nid); +} +#endif /** * This helper is similar with the above one, except that it accounts for pages * that are likely on a pagevec and count them in @nr_pagevec, which will used by @@ -541,7 +569,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages); void invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { - __invalidate_mapping_pages(mapping, start, end, nr_pagevec); + __invalidate_mapping_pages(mapping, start, end, nr_pagevec, NUMA_NO_NODE); }
/*
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4K272 CVE: NA
-------------------------------------------------
Add the label out_free_rdtgrp to handle error branches that happen before the rmid and closid allocations, in order to avoid reusing rdtgrp after it has been freed.
Fixes: 0b16164dc7a9 ("arm64/mpam: Remap reqpartid,pmg to rmid and intpartid to closid") Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Reviewed-by: Cheng Jian cj.chengjian@huawei.com --- fs/resctrlfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/fs/resctrlfs.c b/fs/resctrlfs.c index e1c996ff4e79..7ca9fe3ee4a4 100644 --- a/fs/resctrlfs.c +++ b/fs/resctrlfs.c @@ -749,7 +749,7 @@ static int mkdir_resctrl_prepare(struct kernfs_node *parent_kn, ret = closid_alloc(); if (ret < 0) { rdt_last_cmd_puts("out of CLOSIDs\n"); - goto out_unlock; + goto out_free_rdtgrp; } rdtgrp->closid.intpartid = ret; } @@ -819,10 +819,11 @@ static int mkdir_resctrl_prepare(struct kernfs_node *parent_kn, kernfs_remove(rdtgrp->kn); out_free_rmid: rmid_free(rdtgrp->mon.rmid); - kfree(rdtgrp); out_free_closid: if (rdtgrp->type == RDTCTRL_GROUP) closid_free(rdtgrp->closid.intpartid); +out_free_rdtgrp: + kfree(rdtgrp); out_unlock: resctrl_group_kn_unlock(prgrp_kn); return ret;
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion category: feature bugzilla: 34278, https://gitee.com/openeuler/kernel/issues/I4K27D CVE: NA
-------------------------------------------------
The proximity domain of a memory MSC node cannot be treated as the node id for the component index; we should use acpi_map_pxm_to_node() to get the exact node id instead. For instance, after DIE interleaving, we can only use the node id, because the proximity domains are discontinuous at that point.
Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Reviewed-by: Cheng Jian cj.chengjian@huawei.com --- drivers/acpi/arm64/mpam.c | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-)
diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c index 6c238f5a5c5a..51419473f63b 100644 --- a/drivers/acpi/arm64/mpam.c +++ b/drivers/acpi/arm64/mpam.c @@ -71,42 +71,17 @@ acpi_mpam_label_cache_component_id(struct acpi_table_header *table_hdr, return 0; }
-/** - * acpi_mpam_label_memory_component_id() - Use proximity_domain id to - * label mpam memory node, which be signed by @component_id. - * @proximity_domain: proximity_domain of ACPI MPAM memory node - * @component_id: The id labels the structure mpam_node memory - */ -static int acpi_mpam_label_memory_component_id(u8 proximity_domain, - u32 *component_id) -{ - u32 nid = (u32)proximity_domain; - - if (nid >= nr_online_nodes) { - pr_err_once("Invalid proximity domain\n"); - return -EINVAL; - } - - *component_id = nid; - return 0; -} - static int __init acpi_mpam_parse_memory(struct acpi_mpam_header *h) { - int ret; u32 component_id; struct mpam_device *dev; struct acpi_mpam_node_memory *node = (struct acpi_mpam_node_memory *)h;
- ret = acpi_mpam_label_memory_component_id(node->proximity_domain, - &component_id); - if (ret) { - pr_err("Failed to label memory component id\n"); - return -EINVAL; - } + component_id = acpi_map_pxm_to_node(node->proximity_domain); + if (component_id == NUMA_NO_NODE) + component_id = 0;
- dev = mpam_device_create_memory(component_id, - node->header.base_address); + dev = mpam_device_create_memory(component_id, node->header.base_address); if (IS_ERR(dev)) { pr_err("Failed to create memory node\n"); return -EINVAL;
From: yangerkun yangerkun@huawei.com
hulk inclusion category: bugfix bugzilla: 185799, https://gitee.com/openeuler/kernel/issues/I4JWYM CVE: NA
---------------------------
We use lockref for the dentry reference count without noticing that a huge number of negative dentries under one directory can overflow the lockref count. This can crash the system if it happens under the root directory.

Since there is no perfect solution, we simply limit the maximum dentry count to INT_MAX / 2. Also, going from INT_MAX / 2 to INT_MAX would take a lot of time, so there is no need to do this check under the protection of the dentry lock.

Also, limit FILES_MAX to INT_MAX / 2, since many opens of the same file can lead to an overflow too.

Changelog:
v1->v2:
- add a function to do the check
- add a macro for INT_MAX / 2
Signed-off-by: yangerkun yangerkun@huawei.com Reviewed-by: Miao Xie miaoxie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Conflicts: fs/dcache.c Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/dcache.c | 32 ++++++++++++++++++++++++++++---- fs/filescontrol.c | 2 +- include/linux/fs.h | 3 +++ 3 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c index ea0485861d93..185d71a1c05b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1774,6 +1774,18 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) return dentry; }
+static inline bool d_forbid_overflow(struct dentry *dentry) +{ + if (unlikely(d_count(dentry) >= D_COUNT_MAX)) { + shrink_dcache_parent(dentry); + + if (d_count(dentry) >= D_COUNT_MAX) + return false; + } + + return true; +} + /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate @@ -1785,9 +1797,15 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { - struct dentry *dentry = __d_alloc(parent->d_sb, name); + struct dentry *dentry = NULL; + + if (unlikely(!d_forbid_overflow(parent))) + goto out; + + dentry = __d_alloc(parent->d_sb, name); if (!dentry) - return NULL; + goto out; + spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject @@ -1797,7 +1815,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_parent = parent; list_add(&dentry->d_child, &parent->d_subdirs); spin_unlock(&parent->d_lock); - +out: return dentry; } EXPORT_SYMBOL(d_alloc); @@ -1810,11 +1828,17 @@ EXPORT_SYMBOL(d_alloc_anon);
struct dentry *d_alloc_cursor(struct dentry * parent) { - struct dentry *dentry = d_alloc_anon(parent->d_sb); + struct dentry *dentry = NULL; + + if (unlikely(!d_forbid_overflow(parent))) + goto out; + + dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } +out: return dentry; }
diff --git a/fs/filescontrol.c b/fs/filescontrol.c index 4ad500f40025..fdd557a246be 100644 --- a/fs/filescontrol.c +++ b/fs/filescontrol.c @@ -27,7 +27,7 @@ #include <linux/sched/signal.h> #include <linux/module.h>
-#define FILES_MAX ULONG_MAX +#define FILES_MAX D_COUNT_MAX #define FILES_MAX_STR "max"
static bool no_acct; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8f6704a3f596..243a0987ca2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -44,6 +44,9 @@ #include <asm/byteorder.h> #include <uapi/linux/fs.h>
+#define D_COUNT_MAX (INT_MAX / 2) + + struct backing_dev_info; struct bdi_writeback; struct bio;
From: yangerkun yangerkun@huawei.com
hulk inclusion category: bugfix bugzilla: 185805, https://gitee.com/openeuler/kernel/issues/I4JX0L CVE: NA
---------------------------
Threads running in parallel can add negative dentries under the root directory. Some time later, 'systemctl daemon-reload' will report a soft lockup, since __fsnotify_update_child_dentry_flags() needs to update all children under the root dentry without distinguishing whether they are active or not. This wastes a long time while holding the root dentry's d_lock, and other threads trying to spin_lock d_lock will spin for too long.

Limiting the number of negative dentries under a directory can avoid this.
Signed-off-by: yangerkun yangerkun@huawei.com Reviewed-by: Miao Xie miaoxie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Conflicts: fs/dcache.c include/linux/dcache.h Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/dcache.c | 43 ++++++++++++++++++++++++++++++++++++++++-- include/linux/dcache.h | 4 ++++ 2 files changed, 45 insertions(+), 2 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c index 185d71a1c05b..f5b78cc80a00 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -314,10 +314,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry, unsigned type_flags) { unsigned flags; + struct dentry *parent; + + parent = dentry->d_parent; + if ((dentry->d_flags & DCACHE_NEGATIVE_ACCOUNT) && parent) { + WARN_ON(!inode); + atomic_dec(&parent->d_neg_dnum); + }
dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); - flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU | + DCACHE_NEGATIVE_ACCOUNT); flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } @@ -336,6 +344,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + WARN_ON(dentry->d_flags & DCACHE_NEGATIVE_ACCOUNT); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->u.count))) { @@ -573,8 +582,14 @@ static void __dentry_kill(struct dentry *dentry) /* if it was on the hash then remove it */ __d_drop(dentry); dentry_unlist(dentry, parent); - if (parent) + if (parent) { + if (dentry->d_flags & DCACHE_NEGATIVE_ACCOUNT) { + atomic_dec(&parent->d_neg_dnum); + dentry->d_flags &= ~DCACHE_NEGATIVE_ACCOUNT; + } + spin_unlock(&parent->d_lock); + } if (dentry->d_inode) dentry_unlink_inode(dentry); else @@ -634,6 +649,8 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
static inline bool retain_dentry(struct dentry *dentry) { + struct dentry *parent; + WARN_ON(d_in_lookup(dentry));
/* Unreachable? Get rid of it */ @@ -651,6 +668,27 @@ static inline bool retain_dentry(struct dentry *dentry) if (unlikely(dentry->d_flags & DCACHE_DONTCACHE)) return false;
+ if (unlikely(!dentry->d_parent)) + goto noparent; + + parent = dentry->d_parent; + /* Return false if it's negative */ + WARN_ON((atomic_read(&parent->d_neg_dnum) < 0)); + if (!dentry->d_inode) { + if (!(dentry->d_flags & DCACHE_NEGATIVE_ACCOUNT)) { + unsigned int flags = READ_ONCE(dentry->d_flags); + + flags |= DCACHE_NEGATIVE_ACCOUNT; + WRITE_ONCE(dentry->d_flags, flags); + atomic_inc(&parent->d_neg_dnum); + } + } + + if (!dentry->d_inode && + atomic_read(&parent->d_neg_dnum) >= NEG_DENTRY_LIMIT) + return false; + +noparent: /* retain; LRU fodder */ dentry->d_lockref.count--; if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) @@ -1749,6 +1787,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; + atomic_set(&dentry->d_neg_dnum, 0); dentry->d_sb = sb; dentry->d_op = NULL; dentry->d_fsdata = NULL; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6f95c3300cbb..edb5efeff11a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -84,6 +84,7 @@ extern struct dentry_stat_t dentry_stat; # endif #endif
+#define NEG_DENTRY_LIMIT 16384 #define d_lock d_lockref.lock
struct dentry { @@ -118,6 +119,8 @@ struct dentry { struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; + /* negative dentry under this dentry, if it's dir */ + atomic_t d_neg_dnum; } __randomize_layout;
/* @@ -219,6 +222,7 @@ struct dentry_operations { #define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */ #define DCACHE_DENTRY_CURSOR 0x20000000 #define DCACHE_NORCU 0x40000000 /* No RCU delay for freeing */ +#define DCACHE_NEGATIVE_ACCOUNT 0x80000000
extern seqlock_t rename_lock;
From: Anshuman Khandual khandual@linux.vnet.ibm.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
There are certain devices like specialized accelerator, GPU cards, network cards, FPGA cards etc which might contain onboard memory which is coherent along with the existing system RAM while being accessed either from the CPU or from the device. They share some similar properties with that of normal system RAM but at the same time can also be different with respect to system RAM.
User applications might be interested in using this kind of coherent device memory explicitly or implicitly alongside the system RAM, utilizing all possible core memory functions like anon mapping (LRU), file mapping (LRU), page cache (LRU), driver managed (non LRU), HW poisoning, NUMA migrations etc. To achieve this kind of tight integration with the core memory subsystem, the device onboard coherent memory must be represented as a memory-only NUMA node. At the same time, the arch must export some kind of function to identify this node as coherent device memory, as opposed to any other regular cpu-less memory-only NUMA node.
After achieving the integration with core memory subsystem coherent device memory might still need some special consideration inside the kernel. There can be a variety of coherent memory nodes with different expectations from the core kernel memory. But right now only one kind of special treatment is considered which requires certain isolation.
Now consider the case of a coherent device memory node type which requires isolation. This kind of coherent memory is onboard an external device attached to the system through a link where there is always a chance of a link failure taking down the entire memory node with it. More over the memory might also have higher chance of ECC failure as compared to the system RAM. Hence allocation into this kind of coherent memory node should be regulated. Kernel allocations must not come here. Normal user space allocations too should not come here implicitly (without user application knowing about it). This summarizes isolation requirement of certain kind of coherent device memory node as an example. There can be different kinds of isolation requirement also.
Some coherent memory devices might not require isolation altogether after all. Then there might be other coherent memory devices which might require some other special treatment after being part of core memory representation . For now, will look into isolation seeking coherent device memory node not the other ones.
To implement the integration as well as isolation, the coherent memory node must be present in N_MEMORY and a new N_COHERENT_DEVICE node mask inside the node_states[] array. During memory hotplug operations, the new nodemask N_COHERENT_DEVICE is updated along with N_MEMORY for these coherent device memory nodes. This also creates the following new sysfs based interface to list down all the coherent memory nodes of the system.
/sys/devices/system/node/is_cdm_node
Architectures must export function arch_check_node_cdm() which identifies any coherent device memory node in case they enable CONFIG_COHERENT_DEVICE.
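As an illustrative sketch only (the arm64 stub added below simply returns 0 for now), an architecture that can detect coherent device memory from firmware might implement the hook roughly as follows; cdm_nodemask and how it gets populated are hypothetical:

#include <linux/nodemask.h>

/*
 * Hypothetical example, not part of this patch: record CDM nodes in
 * a nodemask while parsing firmware tables, then report membership
 * through arch_check_node_cdm().
 */
static nodemask_t cdm_nodemask;	/* assumed to be filled by firmware parsing code */

int arch_check_node_cdm(int nid)
{
	return node_isset(nid, cdm_nodemask);
}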
Signed-off-by: Anshuman Khandual khandual@linux.vnet.ibm.com Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/mm/numa.c | 7 ++++ drivers/base/node.c | 6 ++++ include/linux/nodemask.h | 76 +++++++++++++++++++++++++++++++++++++++- mm/Kconfig | 7 ++++ mm/page_alloc.c | 8 +++-- 5 files changed, 101 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index a8303bc6b62a..dae7179ba609 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -25,6 +25,13 @@ static int numa_distance_cnt; static u8 *numa_distance; bool numa_off;
+#ifdef CONFIG_COHERENT_DEVICE +inline int arch_check_node_cdm(int nid) +{ + return 0; +} +#endif + static __init int numa_parse_early_param(char *opt) { if (!opt) diff --git a/drivers/base/node.c b/drivers/base/node.c index 21965de8538b..fecfac25cf16 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -1016,6 +1016,9 @@ static struct node_attr node_state_attr[] = { [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, N_GENERIC_INITIATOR), +#ifdef CONFIG_COHERENT_DEVICE + [N_COHERENT_DEVICE] = _NODE_ATTR(is_cdm_node, N_COHERENT_DEVICE), +#endif };
static struct attribute *node_state_attrs[] = { @@ -1028,6 +1031,9 @@ static struct attribute *node_state_attrs[] = { &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, &node_state_attr[N_GENERIC_INITIATOR].attr.attr, +#ifdef CONFIG_COHERENT_DEVICE + &node_state_attr[N_COHERENT_DEVICE].attr.attr, +#endif NULL };
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index ac398e143c9a..90ea204cc059 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -397,9 +397,12 @@ enum node_states { #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif - N_MEMORY, /* The node has memory(regular, high, movable) */ + N_MEMORY, /* The node has memory(regular, high, movable, cdm) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ +#ifdef CONFIG_COHERENT_DEVICE + N_COHERENT_DEVICE, /* The node has CDM memory */ +#endif NR_NODE_STATES };
@@ -503,6 +506,77 @@ static inline int node_random(const nodemask_t *mask) } #endif
+#ifdef CONFIG_COHERENT_DEVICE +extern int arch_check_node_cdm(int nid); + +static inline nodemask_t system_mem_nodemask(void) +{ + nodemask_t system_mem; + + nodes_clear(system_mem); + nodes_andnot(system_mem, node_states[N_MEMORY], + node_states[N_COHERENT_DEVICE]); + return system_mem; +} + +static inline bool is_cdm_node(int node) +{ + return node_isset(node, node_states[N_COHERENT_DEVICE]); +} + +static inline bool nodemask_has_cdm(nodemask_t mask) +{ + int node, i; + + node = first_node(mask); + for (i = 0; i < nodes_weight(mask); i++) { + if (is_cdm_node(node)) + return true; + node = next_node(node, mask); + } + return false; +} + +static inline void node_set_state_cdm(int node) +{ + if (arch_check_node_cdm(node)) + node_set_state(node, N_COHERENT_DEVICE); +} + +static inline void node_clear_state_cdm(int node) +{ + if (arch_check_node_cdm(node)) + node_clear_state(node, N_COHERENT_DEVICE); +} + +#else + +static inline int arch_check_node_cdm(int nid) { return 0; } + +static inline nodemask_t system_mem_nodemask(void) +{ + return node_states[N_MEMORY]; +} + +static inline bool is_cdm_node(int node) +{ + return false; +} + +static inline bool nodemask_has_cdm(nodemask_t mask) +{ + return false; +} + +static inline void node_set_state_cdm(int node) +{ +} + +static inline void node_clear_state_cdm(int node) +{ +} +#endif /* CONFIG_COHERENT_DEVICE */ + #define node_online_map node_states[N_ONLINE] #define node_possible_map node_states[N_POSSIBLE]
diff --git a/mm/Kconfig b/mm/Kconfig index f565fc82c200..8207683afaf2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -145,6 +145,13 @@ config NUMA_KEEP_MEMINFO config MEMORY_ISOLATION bool
+config COHERENT_DEVICE + bool "coherent device memory" + def_bool n + depends on CPUSETS && ARM64 && NUMA + help + Enable coherent device memory (CDM) support. + # # Only be set on architectures that have completely implemented memory hotplug # feature. If you are not sure, don't touch it. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4528a50690f2..308b570cdcec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7355,8 +7355,10 @@ static unsigned long __init early_calculate_totalpages(void) unsigned long pages = end_pfn - start_pfn;
totalpages += pages; - if (pages) + if (pages) { + node_set_state_cdm(nid); node_set_state(nid, N_MEMORY); + } } return totalpages; } @@ -7694,8 +7696,10 @@ void __init free_area_init(unsigned long *max_zone_pfn) free_area_init_node(nid);
/* Any memory on that node */ - if (pgdat->node_present_pages) + if (pgdat->node_present_pages) { + node_set_state_cdm(nid); node_set_state(nid, N_MEMORY); + } check_for_memory(pgdat, nid); }
From: Anshuman Khandual khandual@linux.vnet.ibm.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
Kernel allocations to the CDM node have already been prevented by putting its entire memory in ZONE_MOVABLE. But the CDM nodes must also be isolated from implicit allocations happening on the system.

Any isolation-seeking CDM node requires isolation from implicit memory allocations from user space, but at the same time there should also be an explicit way to do the memory allocation.

Both zonelists of a platform node are fundamental to where the memory comes from when there is an allocation request. In order to achieve the two objectives stated above, the zonelist building process has to change, as both zonelists (i.e. FALLBACK and NOFALLBACK) give access to the node's memory zones during any kind of memory allocation. The following changes are implemented in this regard.
* CDM node's zones are not part of any other node's FALLBACK zonelist
* CDM node's FALLBACK list contains its own memory zones followed by all system RAM zones in regular order as before
* CDM node's zones are part of its own NOFALLBACK zonelist

The above changes ensure the following, which in turn isolates the CDM nodes as desired.

* There won't be any implicit memory allocation ending up in the CDM node
* Only __GFP_THISNODE marked allocations will come from the CDM node
* CDM node memory can be allocated through the mbind(MPOL_BIND) interface (see the usage sketch after the sample zonelist configuration below)
* System RAM memory will be used as the fallback option in regular order in case the CDM memory is insufficient during a targeted allocation request
Sample zonelist configuration:
[NODE (0)] RAM
	ZONELIST_FALLBACK (0xc00000000140da00)
		(0) (node 0) (DMA 0xc00000000140c000)
		(1) (node 1) (DMA 0xc000000100000000)
	ZONELIST_NOFALLBACK (0xc000000001411a10)
		(0) (node 0) (DMA 0xc00000000140c000)
[NODE (1)] RAM
	ZONELIST_FALLBACK (0xc000000100001a00)
		(0) (node 1) (DMA 0xc000000100000000)
		(1) (node 0) (DMA 0xc00000000140c000)
	ZONELIST_NOFALLBACK (0xc000000100005a10)
		(0) (node 1) (DMA 0xc000000100000000)
[NODE (2)] CDM
	ZONELIST_FALLBACK (0xc000000001427700)
		(0) (node 2) (Movable 0xc000000001427080)
		(1) (node 0) (DMA 0xc00000000140c000)
		(2) (node 1) (DMA 0xc000000100000000)
	ZONELIST_NOFALLBACK (0xc00000000142b710)
		(0) (node 2) (Movable 0xc000000001427080)
[NODE (3)] CDM
	ZONELIST_FALLBACK (0xc000000001431400)
		(0) (node 3) (Movable 0xc000000001430d80)
		(1) (node 0) (DMA 0xc00000000140c000)
		(2) (node 1) (DMA 0xc000000100000000)
	ZONELIST_NOFALLBACK (0xc000000001435410)
		(0) (node 3) (Movable 0xc000000001430d80)
[NODE (4)] CDM
	ZONELIST_FALLBACK (0xc00000000143b100)
		(0) (node 4) (Movable 0xc00000000143aa80)
		(1) (node 0) (DMA 0xc00000000140c000)
		(2) (node 1) (DMA 0xc000000100000000)
	ZONELIST_NOFALLBACK (0xc00000000143f110)
		(0) (node 4) (Movable 0xc00000000143aa80)
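For reference, a minimal user-space sketch of the explicit allocation path mentioned in the list above: binding an anonymous mapping to a CDM node with mbind(MPOL_BIND). The node id (2), the mapping size and the error handling are assumptions for illustration only; link with -lnuma.

	#include <numaif.h>      /* mbind(), MPOL_BIND */
	#include <sys/mman.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		size_t len = 2UL << 20;             /* 2 MiB mapping, arbitrary size */
		unsigned long nodemask = 1UL << 2;  /* assumed CDM node id: 2 */
		void *buf;

		buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED)
			return 1;

		/* Bind the VMA to the CDM node so pages fault in from node 2 only. */
		if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask), 0)) {
			perror("mbind");
			return 1;
		}

		memset(buf, 0, len);                /* touch pages to allocate them */
		munmap(buf, len);
		return 0;
	}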
Signed-off-by: Anshuman Khandual khandual@linux.vnet.ibm.com Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_alloc.c | 10 ++++++++++ 1 file changed, 10 insertions(+)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 308b570cdcec..9bf6ce119d6d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5981,6 +5981,16 @@ static void build_zonelists(pg_data_t *pgdat)
memset(node_order, 0, sizeof(node_order)); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { +#ifdef CONFIG_COHERENT_DEVICE + /* + * CDM node's own zones should not be part of any other + * node's fallback zonelist but only it's own fallback + * zonelist. + */ + if (is_cdm_node(node) && (pgdat->node_id != node)) + continue; +#endif + /* * We don't want to pressure a particular node. * So adding penalty to the first node in same
From: Anshuman Khandual khandual@linux.vnet.ibm.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
This implements allocation isolation for CDM nodes in the buddy allocator by skipping CDM memory zones all the time, except when the gfp flags contain __GFP_THISNODE, or when the nodemask is non-NULL and contains CDM nodes (i.e. an explicit allocation request from the kernel, or a user-process MPOL_BIND policy based request).
Signed-off-by: Anshuman Khandual khandual@linux.vnet.ibm.com Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_alloc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9bf6ce119d6d..9ffa2badb706 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3812,6 +3812,21 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct page *page; unsigned long mark;
+ /* + * CDM nodes get skipped if the requested gfp flag + * does not have __GFP_THISNODE set or the nodemask + * does not have any CDM nodes in case the nodemask + * is non NULL (explicit allocation requests from + * kernel or user process MPOL_BIND policy which has + * CDM nodes). + */ + if (is_cdm_node(zone->zone_pgdat->node_id)) { + if (!(gfp_mask & __GFP_THISNODE)) { + if (!ac->nodemask) + continue; + } + } + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && !__cpuset_zone_allowed(zone, gfp_mask))
From: Anshuman Khandual khandual@linux.vnet.ibm.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
Mark all the applicable VMAs with VM_CDM explicitly during an mbind(MPOL_BIND) call if the user-provided nodemask has a CDM node.

Also mark the corresponding VMA with the VM_CDM flag if the allocated page happens to come from a CDM node. This can be expensive from a performance standpoint. There are multiple checks to avoid an expensive page_to_nid() lookup, but it can be optimized further.
Signed-off-by: Anshuman Khandual khandual@linux.vnet.ibm.com Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mm.h | 5 +++++ mm/mempolicy.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h index 656f524ba7d3..07ea9972c4a9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -282,6 +282,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ + +#ifdef CONFIG_COHERENT_DEVICE +#define VM_CDM 0x00800000 /* Contains coherent device memory */ +#endif + #define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a4c07466d65f..6b6a5f7ce211 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -190,6 +190,42 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, nodes_onto(*ret, tmp, *rel); }
+#ifdef CONFIG_COHERENT_DEVICE +static inline void set_vm_cdm(struct vm_area_struct *vma) +{ + vma->vm_flags |= VM_CDM; +} + +static inline void clr_vm_cdm(struct vm_area_struct *vma) +{ + vma->vm_flags &= ~VM_CDM; +} + +static void mark_vma_cdm(nodemask_t *nmask, + struct page *page, struct vm_area_struct *vma) +{ + if (!page || !vma) + return; + + if (vma->vm_flags & VM_CDM) + return; + + if (nmask && !nodemask_has_cdm(*nmask)) + return; + + if (is_cdm_node(page_to_nid(page))) + vma->vm_flags |= VM_CDM; +} +#else +static inline void set_vm_cdm(struct vm_area_struct *vma) { } +static inline void clr_vm_cdm(struct vm_area_struct *vma) { } + +static void mark_vma_cdm(nodemask_t *nmask, + struct page *page, struct vm_area_struct *vma) +{ +} +#endif + static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) @@ -822,6 +858,10 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end);
+ if (new_pol && (new_pol->mode == MPOL_BIND) && + nodemask_has_cdm(new_pol->v.nodes)) + set_vm_cdm(vma); + if (mpol_equal(vma_policy(vma), new_pol)) continue;
@@ -2224,6 +2264,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages(gfp, order, preferred_nid, nmask); + mark_vma_cdm(nmask, page, vma); mpol_cond_put(pol); out: return page;
From: Anshuman Khandual khandual@linux.vnet.ibm.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
The task struct's mems_allowed element decides the final nodemask from which memory can be allocated in task context, irrespective of any applicable memory policy. CDM nodes should not be used for user allocations; that is one of the overall requirements of their isolation, so they should not be part of any task's mems_allowed nodemask. The system RAM nodemask is used instead of the node_states[N_MEMORY] nodemask when mems_allowed is initialized and when it is updated during memory hotplug.
Signed-off-by: Anshuman Khandual khandual@linux.vnet.ibm.com Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/cgroup/cpuset.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1999fcec45c7..e575435811cf 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -413,9 +413,11 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { - while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) + nodemask_t ram_nodes = system_mem_nodemask(); + + while (!nodes_intersects(cs->effective_mems, ram_nodes)) cs = parent_cs(cs); - nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); + nodes_and(*pmask, cs->effective_mems, ram_nodes); }
/* @@ -3168,7 +3170,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); - new_mems = node_states[N_MEMORY]; + new_mems = system_mem_nodemask();
/* * If subparts_cpus is populated, it is likely that the check below @@ -3291,11 +3293,11 @@ static struct notifier_block cpuset_track_online_nodes_nb = { void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_MEMORY]; + top_cpuset.mems_allowed = system_mem_nodemask(); top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); - top_cpuset.effective_mems = node_states[N_MEMORY]; + top_cpuset.effective_mems = system_mem_nodemask();
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
From: Anshuman Khandual khandual@linux.vnet.ibm.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
The kernel cannot track device memory accesses behind VMAs containing CDM memory, so VM_CDM-marked VMAs must not take part in automatic NUMA balancing. This patch also adds a new helper, is_cdm_vma(), to detect any VMA marked with the VM_CDM flag.
Signed-off-by: Anshuman Khandual khandual@linux.vnet.ibm.com Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mempolicy.h | 14 ++++++++++++++ kernel/sched/fair.c | 3 ++- 2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5f1c74df264d..64ab4398ba90 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -181,6 +181,20 @@ extern int mpol_parse_str(char *str, struct mempolicy **mpol);
extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
+#ifdef CONFIG_COHERENT_DEVICE +static inline bool is_cdm_vma(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_CDM) + return true; + return false; +} +#else +static inline bool is_cdm_vma(struct vm_area_struct *vma) +{ + return false; +} +#endif + /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9c34ad6f9a67..1a0cb9a4161e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2836,7 +2836,8 @@ static void task_numa_work(struct callback_head *work) } for (; vma; vma = vma->vm_next) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || - is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + is_vm_hugetlb_page(vma) || is_cdm_vma(vma) || + (vma->vm_flags & VM_MIXEDMAP)) { continue; }
From: Anshuman Khandual khandual@linux.vnet.ibm.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
VMAs containing CDM memory should be excluded from KSM merging. With this change, a madvise(MADV_MERGEABLE) request on such a VMA is ignored.
Signed-off-by: Anshuman Khandual khandual@linux.vnet.ibm.com Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/ksm.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/mm/ksm.c b/mm/ksm.c index 25b8362a4f89..582c02058baf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -38,6 +38,7 @@ #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/mempolicy.h>
#include <asm/tlbflush.h> #include "internal.h" @@ -2454,6 +2455,11 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (vma_is_dax(vma)) return 0;
+#ifdef CONFIG_COHERENT_DEVICE + if (is_cdm_vma(vma)) + return 0; +#endif + #ifdef VM_SAO if (*vm_flags & VM_SAO) return 0;
From: Anshuman Khandual khandual@linux.vnet.ibm.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
__GFP_THISNODE specifically asks for the memory to be allocated from the given node. Not all requests that end up in __alloc_pages_nodemask() originate from process context, where cpusets make the most sense. The current condition enforces the cpuset limitation on every allocation, whether it originated from process context or not, which prevents __GFP_THISNODE mandated allocations from being served by the specified node. For a coherent device memory node, which is isolated from every cpuset nodemask in the system, this blocks the only way of allocating from it; this patch changes that.
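For illustration, a hedged kernel-side sketch of the explicit path that this series keeps open: an allocation pinned to an assumed CDM node id with __GFP_THISNODE. Since a CDM node's memory sits entirely in ZONE_MOVABLE, the movable gfp flags are needed as well; the helper names cdm_alloc_example()/cdm_free_example() are hypothetical.

	#include <linux/gfp.h>
	#include <linux/mm.h>

	/* Sketch only: explicitly allocate pages from an assumed CDM node @nid. */
	static struct page *cdm_alloc_example(int nid, unsigned int order)
	{
		/*
		 * __GFP_THISNODE pins the request to @nid; GFP_HIGHUSER_MOVABLE
		 * is needed because the CDM node's memory is all in ZONE_MOVABLE.
		 */
		return alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
					order);
	}

	static void cdm_free_example(struct page *page, unsigned int order)
	{
		if (page)
			__free_pages(page, order);
	}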
Signed-off-by: Anshuman Khandual khandual@linux.vnet.ibm.com Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9ffa2badb706..4bfb52cb677f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4896,7 +4896,11 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, ac->nodemask = nodemask; ac->migratetype = gfp_migratetype(gfp_mask);
+#ifdef CONFIG_COHERENT_DEVICE + if (cpusets_enabled() && !(*alloc_gfp & __GFP_THISNODE)) { +#else if (cpusets_enabled()) { +#endif *alloc_gfp |= __GFP_HARDWALL; /* * When we are in the interrupt context, it is irrelevant
From: Anshuman Khandual khandual@linux.vnet.ibm.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
CDM nodes need an explicit memory allocation mechanism from user space. After the previous FALLBACK zonelist rebuilding changes, an mbind(MPOL_BIND) based allocation request fails on the CDM node. This is because the requesting local node's FALLBACK zonelist is selected for the subsequent nodemask processing that implements MPOL_BIND. As the CDM node's zones are not part of any regular node's FALLBACK zonelist, the allocation simply fails without finding a valid zone. The requesting node is always going to be different from the CDM node, which has no CPUs. Hence the MPOL_BIND implementation must choose the given CDM node's FALLBACK zonelist instead of the requesting local node's FALLBACK zonelist. This patch implements that change.
Signed-off-by: Anshuman Khandual khandual@linux.vnet.ibm.com Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/mempolicy.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6b6a5f7ce211..41b2f0174f02 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1935,6 +1935,13 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); }
+ if (policy->mode == MPOL_BIND) { + if (unlikely(!node_isset(nd, policy->v.nodes))) { + if (is_cdm_node(first_node(policy->v.nodes))) + nd = first_node(policy->v.nodes); + } + } + return nd; }
From: Lijun Fang fanglijun3@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA ---------------------
At kernel boot we need to determine whether each node is DDR or HBM and add the HBM nodes to the CDM nodemask by parsing the command line, instead of relying on memory hotplug.
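As a quick aid for the mask format described in the kernel-parameters.txt hunk below, a small user-space sketch that computes the cdm-nodes= value; the HBM node ids here are assumptions matching the example in that hunk.

	#include <stdio.h>

	int main(void)
	{
		int hbm_nodes[] = { 1, 2 };   /* assumed HBM (CDM) node ids */
		unsigned long mask = 0;
		unsigned int i;

		/* One bit per NUMA node: set the bit when that node is HBM. */
		for (i = 0; i < sizeof(hbm_nodes) / sizeof(hbm_nodes[0]); i++)
			mask |= 1UL << hbm_nodes[i];

		printf("cdm-nodes=%#lx\n", mask);   /* prints cdm-nodes=0x6 */
		return 0;
	}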
Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../admin-guide/kernel-parameters.txt | 6 ++++++ arch/arm64/include/asm/numa.h | 3 +++ arch/arm64/mm/numa.c | 20 +++++++++++++++++++ 3 files changed, 29 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8eca743da732..d4b9d4a05b7d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -493,6 +493,12 @@ ccw_timeout_log [S390] See Documentation/s390/common_io.rst for details.
+ cdm-nodes= [KNL] + Format: hexadecimal expression + One bit express one node, if the node is HBM, set the + bit to 1. Then transform Binary to hexadecimal. + Example: node1, node2 is HBM, cdm-nodes=0x06. + cgroup_disable= [KNL] Disable a particular controller Format: {name of the controller(s) to disable} The effects of cgroup_disable=foo are: diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h index e0c51519e71b..43bfff72a32f 100644 --- a/arch/arm64/include/asm/numa.h +++ b/arch/arm64/include/asm/numa.h @@ -19,6 +19,9 @@ extern bool numa_off; extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; void numa_clear_node(unsigned int cpu);
+#ifdef CONFIG_COHERENT_DEVICE +extern nodemask_t cdmmask; +#endif #ifdef CONFIG_DEBUG_PER_CPU_MAPS const struct cpumask *cpumask_of_node(int node); #else diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index dae7179ba609..b2260bb53691 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -26,10 +26,30 @@ static u8 *numa_distance; bool numa_off;
#ifdef CONFIG_COHERENT_DEVICE +nodemask_t cdmmask; + inline int arch_check_node_cdm(int nid) { + return node_isset(nid, cdmmask); +} + +static int __init cdm_nodes_setup(char *s) +{ + int nid; + unsigned long tmpmask; + int err; + + err = kstrtoul(s, 0, &tmpmask); + if (err) + return err; + + for (nid = 0; nid < MAX_NUMNODES; nid++) { + if ((tmpmask >> nid) & 1) + node_set(nid, cdmmask); + } return 0; } +early_param("cdm-nodes", cdm_nodes_setup); #endif
static __init int numa_parse_early_param(char *opt)
From: Lijun Fang fanglijun3@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -----------------
CDM nodes should not be part of mems_allowed. However, allocating from a CDM node must still be allowed when mpol->mode is MPOL_BIND.
Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mm.h | 8 ++++---- mm/hugetlb.c | 16 ++++++++++++---- mm/internal.h | 3 +++ mm/mempolicy.c | 6 +++++- mm/page_alloc.c | 12 ++++++++++-- 5 files changed, 34 insertions(+), 11 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h index 07ea9972c4a9..3780281c8112 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -283,10 +283,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
-#ifdef CONFIG_COHERENT_DEVICE -#define VM_CDM 0x00800000 /* Contains coherent device memory */ -#endif - #define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ @@ -303,6 +299,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
+#ifdef CONFIG_COHERENT_DEVICE +#define VM_CDM 0x100000000 /* Contains coherent device memory */ +#endif + #ifdef CONFIG_USERSWAP /* bit[32:36] is the protection key of intel, so use a large value for VM_USWAP */ #define VM_USWAP 0x2000000000000000 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f553b71f2518..1bbe763dce73 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1091,13 +1091,20 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) }
static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, - nodemask_t *nmask) + nodemask_t *nmask, struct mempolicy *mpol) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; int node = NUMA_NO_NODE; + bool mbind_cdmnode = false; + +#ifdef CONFIG_COHERENT_DEVICE + if (is_cdm_node(nid) && ((mpol != NULL && mpol->mode == MPOL_BIND) || + (gfp_mask & __GFP_THISNODE))) + mbind_cdmnode = true; +#endif
zonelist = node_zonelist(nid, gfp_mask);
@@ -1106,7 +1113,8 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { struct page *page;
- if (!cpuset_zone_allowed(zone, gfp_mask)) + if (!cpuset_zone_allowed(zone, gfp_mask) && + mbind_cdmnode == false) continue; /* * no need to ask again on the same node. Pool is node rather than @@ -1152,7 +1160,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask, mpol); if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; @@ -2032,7 +2040,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, if (h->free_huge_pages - h->resv_huge_pages > 0) { struct page *page;
- page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); + page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask, NULL); if (page) { spin_unlock_irq(&hugetlb_lock); return page; diff --git a/mm/internal.h b/mm/internal.h index eb39a9b93db3..9451ba9bbcf3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -593,6 +593,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #else #define ALLOC_NOFRAGMENT 0x0 #endif +#ifdef CONFIG_COHERENT_DEVICE +#define ALLOC_CDM 0x200 +#endif #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
enum ttu_flags; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 41b2f0174f02..d63181ae4c98 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -274,6 +274,9 @@ static int mpol_set_nodemask(struct mempolicy *pol, nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]);
+#ifdef CONFIG_COHERENT_DEVICE + nodes_or(nsc->mask1, cdmmask, nsc->mask1); +#endif VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) nodes = NULL; /* explicit local allocation */ @@ -1915,7 +1918,8 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) /* Lower zones don't get a nodemask applied for MPOL_BIND */ if (unlikely(policy->mode == MPOL_BIND) && apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) + (cpuset_nodemask_valid_mems_allowed(&policy->v.nodes) || + nodemask_has_cdm(policy->v.nodes))) return &policy->v.nodes;
return NULL; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4bfb52cb677f..62c94ea31e17 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3829,7 +3829,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && - !__cpuset_zone_allowed(zone, gfp_mask)) + !__cpuset_zone_allowed(zone, gfp_mask) +#ifdef CONFIG_COHERENT_DEVICE + && !(alloc_flags & ALLOC_CDM) +#endif + ) continue; /* * When allocating a page cache page for writing, we @@ -4908,8 +4912,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, */ if (!in_interrupt() && !ac->nodemask) ac->nodemask = &cpuset_current_mems_allowed; - else + else { *alloc_flags |= ALLOC_CPUSET; +#ifdef CONFIG_COHERENT_DEVICE + *alloc_flags |= ALLOC_CDM; +#endif + } }
fs_reclaim_acquire(gfp_mask);
From: Lijun Fang fanglijun3@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
Enable CONFIG_COHERENT_DEVICE in openeuler_defconfig.
Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index fe7123dd102a..f23a30a6fb01 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -988,6 +988,7 @@ CONFIG_HAVE_FAST_GUP=y CONFIG_ARCH_KEEP_MEMBLOCK=y CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_MEMORY_ISOLATION=y +CONFIG_COHERENT_DEVICE=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTPLUG_SPARSE=y CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4K6CJ CVE: NA
--------------------------------
We must ensure that the following four instructions are cache-line aligned; otherwise, the performance of the libMicro pread benchmark suffers.
1:	# uao_user_alternative 9f, str, sttr, xzr, x0, 8
	str	xzr, [x0], #8
	nop
	subs	x1, x1, #8
	b.pl	1b
with this patch:
            prc thr   usecs/call      samples   errors cnt/samp     size
pread_z100    1   1      5.88400          807        0        1   102400
The pread result can range from 5 to 9 usecs/call depending on the alignment of this function.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/lib/clear_user.S | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 48a3a26eff66..db74e9ff1e20 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -23,6 +23,9 @@ SYM_FUNC_START(__arch_clear_user) mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f +#ifdef CONFIG_ARCH_HISI + .align 5 +#endif 1: uao_user_alternative 9f, str, sttr, xzr, x0, 8 subs x1, x1, #8
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: bugfix bugzilla: 34546, https://gitee.com/openeuler/kernel/issues/I4JKT1 CVE: NA
-----------------------------------------------
A deadlock caused by logbuf_lock can occur on panic:

a) Panic CPU is running in non-NMI context
b) Panic CPU sends out shutdown IPI via NMI vector
c) One of the CPUs that we bring down via the NMI vector held logbuf_lock
d) Panic CPU tries to take logbuf_lock, then deadlock occurs.
We try to re-init logbuf_lock in printk_safe_flush_on_panic() to avoid the deadlock, but that does not work here, because:

Firstly, it is inappropriate to check num_online_cpus() there. When the other CPUs are brought down via the NMI vector, the panic CPU does not wait long for them to stop, so when this problem occurs num_online_cpus() may still be greater than 1.

Secondly, printk_safe_flush_on_panic() is called after the panic notifier callbacks, so if printk() is called from a panic notifier callback, the deadlock still occurs. For example, if ftrace_dump_on_oops is set, we print some debug information, which tries to take logbuf_lock.

To avoid this deadlock, attempt to re-init logbuf_lock from the panic CPU before the panic_notifier_list callbacks run.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/printk.h | 5 +++++ kernel/panic.c | 2 ++ kernel/printk/printk.c | 17 +++++++++++++++++ 3 files changed, 24 insertions(+)
diff --git a/include/linux/printk.h b/include/linux/printk.h index fe7eb2351610..de1457e3af3f 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -209,6 +209,7 @@ void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack(void) __cold; extern void printk_safe_flush(void); extern void printk_safe_flush_on_panic(void); +extern void zap_locks(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -280,6 +281,10 @@ static inline void printk_safe_flush(void) static inline void printk_safe_flush_on_panic(void) { } + +static inline void zap_locks(void) +{ +} #endif
extern int kptr_restrict; diff --git a/kernel/panic.c b/kernel/panic.c index 332736a72a58..75f07bb57006 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -265,6 +265,8 @@ void panic(const char *fmt, ...) crash_smp_send_stop(); }
+ zap_locks(); + /* * Run any panic handlers, including those that might need to * add information to the kmsg dump output. diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d0df95346ab3..a504ff599d69 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1742,6 +1742,23 @@ static DEFINE_RAW_SPINLOCK(console_owner_lock); static struct task_struct *console_owner; static bool console_waiter;
+void zap_locks(void) +{ + if (raw_spin_is_locked(&logbuf_lock)) { + debug_locks_off(); + raw_spin_lock_init(&logbuf_lock); + + console_suspended = 1; + sema_init(&console_sem, 1); + } + + if (raw_spin_is_locked(&console_owner_lock)) { + raw_spin_lock_init(&console_owner_lock); + console_owner = NULL; + console_waiter = false; + } +} + /** * console_lock_spinning_enable - mark beginning of code where another * thread might safely busy wait
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: bugfix bugzilla: 34546, https://gitee.com/openeuler/kernel/issues/I4JKT1 CVE: NA
----------------------------------------
There are two problems with the implementation and use of zap_locks().

Firstly, console_sem does not require re-initialization in zap_locks(), because:
1). printk() itself does try_lock() and skips console handling when the semaphore is not available.
2). panic() tries to push the messages later in console_flush_on_panic(). It ignores the semaphore. Also most console drivers ignore their internal locks because oops_in_progress is set by bust_spinlocks().
Secondly, the situation is more complicated when NMI is not used.

1). Non-stopped CPUs are in an unknown state, most likely in a busy loop. Nobody knows whether printk() is repeatedly called in the loop. When it is called, re-initializing any lock would cause a double unlock and deadlock.

2). It would be possible to add some more hacks. One problem is that there are two groups of users: one prefers to risk a deadlock and have a chance to see the messages, while the other prefers to always reach emergency_restart() and reboot the machine.
Fixes: d0dfaa87c2aa ("printk/panic: Avoid deadlock in printk()") Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/panic.c | 25 +++++++++++++++++++++++++ kernel/printk/printk.c | 3 --- 2 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/kernel/panic.c b/kernel/panic.c index 75f07bb57006..3d75855db4e6 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -265,7 +265,32 @@ void panic(const char *fmt, ...) crash_smp_send_stop(); }
+ /* + * ZAP console related locks when nmi broadcast. If a crash is occurring, + * make sure we can't deadlock. And make sure that we print immediately. + * + * A deadlock caused by logbuf_lock can be occured when panic: + * a) Panic CPU is running in non-NMI context; + * b) Panic CPU sends out shutdown IPI via NMI vector; + * c) One of the CPUs that we bring down via NMI vector holded logbuf_lock; + * d) Panic CPU try to hold logbuf_lock, then deadlock occurs. + * + * At present, only try to solve this problem for the ARCH with NMI, + * by reinit lock, this situation is more complicated when NMI is not + * used. + * 1). Non-stopped CPUs are in unknown state, most likely in a busy loop. + * Nobody knows whether printk() is repeatedly called in the loop. + * When it was called, re-initializing any lock would cause double + * unlock and deadlock. + * + * 2). It would be possible to add some more hacks. One problem is that + * there are two groups of users. One prefer to risk a deadlock and + * have a chance to see the messages. Others prefer to always + * reach emergency_restart() and reboot the machine. + */ +#ifdef CONFIG_X86 zap_locks(); +#endif
/* * Run any panic handlers, including those that might need to diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a504ff599d69..bf58d5777bce 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1747,9 +1747,6 @@ void zap_locks(void) if (raw_spin_is_locked(&logbuf_lock)) { debug_locks_off(); raw_spin_lock_init(&logbuf_lock); - - console_suspended = 1; - sema_init(&console_sem, 1); }
if (raw_spin_is_locked(&console_owner_lock)) {
From: Chen Zhou chenzhou10@huawei.com
hulk inclusion category: bugfix bugzilla: 41832, https://gitee.com/openeuler/kernel/issues/I4JKT1 CVE: NA
-----------------------------------------------
When one CPU panics, the panic CPU sends an NMI to the other CPUs. If one of the non-panic CPUs is in printk() and gets stopped before releasing console_waiter, the panic CPU may spin waiting for it.

Here, just release console_waiter directly after all non-panic CPUs have been stopped.
Signed-off-by: Chen Zhou chenzhou10@huawei.com Reviewed-by: Jian Cheng cj.chengjian@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/printk/printk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bf58d5777bce..e4328bc341f3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1751,9 +1751,10 @@ void zap_locks(void)
if (raw_spin_is_locked(&console_owner_lock)) { raw_spin_lock_init(&console_owner_lock); - console_owner = NULL; - console_waiter = false; } + + console_owner = NULL; + console_waiter = false; }
/**
From: Chen Zhou chenzhou10@huawei.com
hulk inclusion category: bugfix bugzilla: 34546, https://gitee.com/openeuler/kernel/issues/I4JKT1 CVE: NA
----------------------------------------
When one CPU panics, the panic CPU sends an NMI to the other CPUs. If one of the non-panic CPUs is in printk() and gets stopped in console_trylock_spinning() before releasing sem->lock, the panic CPU may spin waiting for sem->lock in console_trylock_spinning().

Reinit console_sem in zap_locks() to fix this.
Signed-off-by: Chen Zhou chenzhou10@huawei.com Reviewed-by: Jian Cheng cj.chengjian@huawei.com Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/printk/printk.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e4328bc341f3..69a1be81dd98 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1755,6 +1755,8 @@ void zap_locks(void)
console_owner = NULL; console_waiter = false; + + sema_init(&console_sem, 1); }
/**
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: bugfix bugzilla: 34546, https://gitee.com/openeuler/kernel/issues/I4JKT1 CVE: NA
----------------------------------------
Any architecture that involves an NMI should be treated with caution, for example x86, or arm64 with pseudo-NMI enabled.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/printk.h | 4 ++++ kernel/panic.c | 2 +- kernel/printk/printk.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/include/linux/printk.h b/include/linux/printk.h index de1457e3af3f..e6a8ee6db68e 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -209,8 +209,12 @@ void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack(void) __cold; extern void printk_safe_flush(void); extern void printk_safe_flush_on_panic(void); +#if defined(CONFIG_X86) || defined(CONFIG_ARM64_PSEUDO_NMI) extern void zap_locks(void); #else +static inline void zap_locks(void) { } +#endif +#else static inline __printf(1, 0) int vprintk(const char *s, va_list args) { diff --git a/kernel/panic.c b/kernel/panic.c index 3d75855db4e6..d991c3b1b559 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -265,6 +265,7 @@ void panic(const char *fmt, ...) crash_smp_send_stop(); }
+#if defined(CONFIG_X86) || defined(CONFIG_ARM64_PSEUDO_NMI) /* * ZAP console related locks when nmi broadcast. If a crash is occurring, * make sure we can't deadlock. And make sure that we print immediately. @@ -288,7 +289,6 @@ void panic(const char *fmt, ...) * have a chance to see the messages. Others prefer to always * reach emergency_restart() and reboot the machine. */ -#ifdef CONFIG_X86 zap_locks(); #endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 69a1be81dd98..729e4ce2decb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1742,6 +1742,7 @@ static DEFINE_RAW_SPINLOCK(console_owner_lock); static struct task_struct *console_owner; static bool console_waiter;
+#if defined(CONFIG_X86) || defined(CONFIG_ARM64_PSEUDO_NMI) void zap_locks(void) { if (raw_spin_is_locked(&logbuf_lock)) { @@ -1758,6 +1759,7 @@ void zap_locks(void)
sema_init(&console_sem, 1); } +#endif
/** * console_lock_spinning_enable - mark beginning of code where another