From: Yu Kuai yukuai3@huawei.com
mainline inclusion from mainline-v5.14-rc1 commit da1e6fe563e62801fa033255f68c0bb9bf8c2c69 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I53BBP backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Return a negative error code from the error handling case instead of 0, as done elsewhere in this function.
Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Signed-off-by: Miquel Raynal miquel.raynal@bootlin.com Link: https://lore.kernel.org/linux-mtd/20210408133812.1209798-1-yukuai3@huawei.co... Signed-off-by: Yi Yang yiyang13@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/mtd/devices/phram.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/mtd/devices/phram.c b/drivers/mtd/devices/phram.c index aa88558c7edb..d7d17a4b61f5 100644 --- a/drivers/mtd/devices/phram.c +++ b/drivers/mtd/devices/phram.c @@ -270,6 +270,7 @@ static int phram_setup(const char *val) if (len == 0 || erasesize == 0 || erasesize > len || erasesize > UINT_MAX || rem) { parse_err("illegal erasesize or len\n"); + ret = -EINVAL; goto error; }
From: Miaohe Lin linmiaohe@huawei.com
mainline inclusion from mainline-v5.15-rc1 commit 6401c4eb57f947a49eb144b5b0787cde3318e82e category: bugfix bugzilla: 180689, https://gitee.com/openeuler/kernel/issues/I53CMX CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When try_grab_page() fails, the put_dev_pagemap() call is missed, so the pgmap refcount leaks in this case. Also remove the check for pgmap against NULL, as that is already checked inside put_dev_pagemap().
[akpm@linux-foundation.org: simplify, cleanup] [akpm@linux-foundation.org: fix return value]
Link: https://lkml.kernel.org/r/20210807093620.21347-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin linmiaohe@huawei.com Fixes: 3faa52c03f44 ("mm/gup: track FOLL_PIN pages") Reviewed-by: John Hubbard jhubbard@nvidia.com Reviewed-by: Claudio Imbrenda imbrenda@linux.ibm.com Cc: Jan Kara jack@suse.cz Cc: Kirill A. Shutemov kirill.shutemov@linux.intel.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 6401c4eb57f947a49eb144b5b0787cde3318e82e) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/gup.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c index ee9c2c39c299..4e9945299fe5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2319,6 +2319,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; + int ret = 1;
do { struct page *page = pfn_to_page(pfn); @@ -2326,21 +2327,22 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; + ret = 0; + break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(!try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; + ret = 0; + break; } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end);
- if (pgmap) - put_dev_pagemap(pgmap); - return 1; + put_dev_pagemap(pgmap); + return ret; }
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
From: Wei Li liwei391@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I53K4K CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Currently, clear_page() uses the DC ZVA instruction unconditionally. But it should make sure that DCZID_EL0.DZP, which indicates whether or not use of the DC ZVA instruction is prohibited, is zero before using the instruction. Use STNP instead when DCZID_EL0.DZP == 1.
Fixes: f27bb139c387 ("arm64: Miscellaneous library functions") Based-on-patch-by: Reiji Watanabe reijiw@google.com Signed-off-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/cpufeature.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 809dcac24e18..7d110ce5d7f4 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1792,6 +1792,9 @@ static bool has_mor_nontemporal(const struct arm64_cpu_capabilities *entry) static bool can_clearpage_use_stnp(const struct arm64_cpu_capabilities *entry, int scope) { + if (read_sysreg(dczid_el0) & BIT(DCZID_DZP_SHIFT)) + return true; + return use_clearpage_stnp && has_mor_nontemporal(entry); }
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion from linux-5.10.106 commit b9a229fd48bfa45edb954c75a57e3931a3da6c5f category: bugfix bugzilla: 186561 https://gitee.com/src-openeuler/kernel/issues/I53VJO CVE: CVE-2022-1199
--------------------------------
[ Upstream commit 4e0f718daf97d47cf7dec122da1be970f145c809 ]
The previous commit 1ade48d0c27d ("ax25: NPD bug when detaching AX25 device") introduced lock_sock() into ax25_kill_by_device() to prevent an NPD bug. But a concurrent NPD or UAF bug can still occur when lock_sock() or release_sock() dereferences ax25_cb->sock.
The NULL pointer dereference bug can be shown as below:
ax25_kill_by_device() | ax25_release() | ax25_destroy_socket() | ax25_cb_del() ... | ... | ax25->sk=NULL; lock_sock(s->sk); //(1) | s->ax25_dev = NULL; | ... release_sock(s->sk); //(2) | ... |
The root cause is that the sock is set to null before dereference site (1) or (2). Therefore, this patch extracts the ax25_cb->sock in advance, and uses ax25_list_lock to protect it, which can synchronize with ax25_cb_del() and ensure the value of sock is not null before dereference sites.
The concurrency UAF bug can be shown as below:
ax25_kill_by_device() | ax25_release() | ax25_destroy_socket() ... | ... | sock_put(sk); //FREE lock_sock(s->sk); //(1) | s->ax25_dev = NULL; | ... release_sock(s->sk); //(2) | ... |
The root cause is that the sock is released before dereference site (1) or (2). Therefore, this patch uses sock_hold() to increase the refcount of the sock and uses ax25_list_lock to protect it, which can synchronize with ax25_cb_del() in ax25_destroy_socket() and ensure the sock will not be released before the dereference sites.
Signed-off-by: Duoming Zhou duoming@zju.edu.cn Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ax25/af_ax25.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 22278807b3f3..cbedc33f8b27 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -77,6 +77,7 @@ static void ax25_kill_by_device(struct net_device *dev) { ax25_dev *ax25_dev; ax25_cb *s; + struct sock *sk;
if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return; @@ -85,13 +86,15 @@ static void ax25_kill_by_device(struct net_device *dev) again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { + sk = s->sk; + sock_hold(sk); spin_unlock_bh(&ax25_list_lock); - lock_sock(s->sk); + lock_sock(sk); s->ax25_dev = NULL; - release_sock(s->sk); + release_sock(sk); ax25_disconnect(s, ENETUNREACH); spin_lock_bh(&ax25_list_lock); - + sock_put(sk); /* The entry could have been deleted from the * list meanwhile and thus the next pointer is * no longer valid. Play it safe and restart
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion from linux-5.10.106 commit e2201ef32f933944ee02e59205adb566bafcdf91 category: bugfix bugzilla: 186561 https://gitee.com/src-openeuler/kernel/issues/I53VJO CVE: CVE-2022-1199
--------------------------------
[ Upstream commit 71171ac8eb34ce7fe6b3267dce27c313ab3cb3ac ]
When two ax25 devices attempt to establish a connection, the requester uses ax25_create(), ax25_bind() and ax25_connect() to initiate the connection. The receiver uses ax25_rcv() to accept the connection and uses ax25_create_cb() in ax25_rcv() to create the ax25_cb, but the ax25_cb->sk is NULL. When the receiver is detaching, a NULL pointer dereference bug caused by sock_hold(sk) in ax25_kill_by_device() will happen. The corresponding fail log is shown below:
=============================================================== BUG: KASAN: null-ptr-deref in ax25_device_event+0xfd/0x290 Call Trace: ... ax25_device_event+0xfd/0x290 raw_notifier_call_chain+0x5e/0x70 dev_close_many+0x174/0x220 unregister_netdevice_many+0x1f7/0xa60 unregister_netdevice_queue+0x12f/0x170 unregister_netdev+0x13/0x20 mkiss_close+0xcd/0x140 tty_ldisc_release+0xc0/0x220 tty_release_struct+0x17/0xa0 tty_release+0x62d/0x670 ...
This patch adds a NULL check on s->sk in ax25_kill_by_device(). If s->sk is NULL, it takes a separate branch that clears s->ax25_dev and calls ax25_disconnect() without touching the sock.
Fixes: 4e0f718daf97 ("ax25: improve the incomplete fix to avoid UAF and NPD bugs") Reported-by: Thomas Osterried thomas@osterried.de Signed-off-by: Duoming Zhou duoming@zju.edu.cn Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ax25/af_ax25.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index cbedc33f8b27..e5f6838a235a 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -87,6 +87,13 @@ static void ax25_kill_by_device(struct net_device *dev) ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { sk = s->sk; + if (!sk) { + spin_unlock_bh(&ax25_list_lock); + s->ax25_dev = NULL; + ax25_disconnect(s, ENETUNREACH); + spin_lock_bh(&ax25_list_lock); + goto again; + } sock_hold(sk); spin_unlock_bh(&ax25_list_lock); lock_sock(sk);
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion from linux-5.10.112 commit 145ea8d213e8f46667cd904ae79d17f298750f00 category: bugfix bugzilla: 186561 https://gitee.com/src-openeuler/kernel/issues/I53VJO CVE: CVE-2022-1199
--------------------------------
commit 7ec02f5ac8a5be5a3f20611731243dc5e1d9ba10 upstream.
The ax25_disconnect() in ax25_kill_by_device() is not protected by any locks, thus there is a race condition between ax25_disconnect() and ax25_destroy_socket(). When ax25->sk is set to NULL by ax25_destroy_socket(), a NULL pointer dereference bug will occur if site (1) or (2) dereferences ax25->sk.
ax25_kill_by_device() | ax25_release() ax25_disconnect() | ax25_destroy_socket() ... | if(ax25->sk != NULL) | ... ... | ax25->sk = NULL; bh_lock_sock(ax25->sk); //(1) | ... ... | bh_unlock_sock(ax25->sk); //(2)|
This patch moves ax25_disconnect() inside the lock_sock()/release_sock() section, which can synchronize with ax25_destroy_socket() in ax25_release().
Fail log:
=============================================================== BUG: kernel NULL pointer dereference, address: 0000000000000088 ... RIP: 0010:_raw_spin_lock+0x7e/0xd0 ... Call Trace: ax25_disconnect+0xf6/0x220 ax25_device_event+0x187/0x250 raw_notifier_call_chain+0x5e/0x70 dev_close_many+0x17d/0x230 rollback_registered_many+0x1f1/0x950 unregister_netdevice_queue+0x133/0x200 unregister_netdev+0x13/0x20 ...
Signed-off-by: Duoming Zhou duoming@zju.edu.cn Signed-off-by: David S. Miller davem@davemloft.net [OP: backport to 5.10: adjust context] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Conflict: net/ax25/af_ax25.c Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ax25/af_ax25.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index e5f6838a235a..4100aec3017d 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -98,8 +98,8 @@ static void ax25_kill_by_device(struct net_device *dev) spin_unlock_bh(&ax25_list_lock); lock_sock(sk); s->ax25_dev = NULL; - release_sock(sk); ax25_disconnect(s, ENETUNREACH); + release_sock(sk); spin_lock_bh(&ax25_list_lock); sock_put(sk); /* The entry could have been deleted from the
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion from linux-5.10.112 commit f934fa478dd17411bc6884153dc824ff9e7505d8 category: bugfix bugzilla: 186561 https://gitee.com/src-openeuler/kernel/issues/I53VJO CVE: CVE-2022-1205
--------------------------------
commit fc6d01ff9ef03b66d4a3a23b46fc3c3d8cf92009 upstream.
The previous commit 7ec02f5ac8a5 ("ax25: fix NPD bug in ax25_disconnect") moved ax25_disconnect() inside the lock_sock()/release_sock() section in order to prevent NPD bugs. But there are race conditions that may lead to null pointer dereferences in ax25_heartbeat_expiry(), ax25_t1timer_expiry(), ax25_t2timer_expiry(), ax25_t3timer_expiry() and ax25_idletimer_expiry(), when we use ax25_kill_by_device() to detach the ax25 device.
One of the race conditions that cause null pointer dereferences can be shown as below:
(Thread 1) | (Thread 2) ax25_connect() | ax25_std_establish_data_link() | ax25_start_t1timer() | mod_timer(&ax25->t1timer,..) | | ax25_kill_by_device() (wait a time) | ... | s->ax25_dev = NULL; //(1) ax25_t1timer_expiry() | ax25->ax25_dev->values[..] //(2)| ... ... |
ax25_cb->ax25_dev is set to NULL at position (1), and the NULL pointer is then dereferenced at position (2).
The corresponding fail log is shown below:
=============================================================== BUG: kernel NULL pointer dereference, address: 0000000000000050 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.17.0-rc6-00794-g45690b7d0 RIP: 0010:ax25_t1timer_expiry+0x12/0x40 ... Call Trace: call_timer_fn+0x21/0x120 __run_timers.part.0+0x1ca/0x250 run_timer_softirq+0x2c/0x60 __do_softirq+0xef/0x2f3 irq_exit_rcu+0xb6/0x100 sysvec_apic_timer_interrupt+0xa2/0xd0 ...
This patch moves ax25_disconnect() before s->ax25_dev = NULL and uses del_timer_sync() to delete timers in ax25_disconnect(). If ax25_disconnect() is called by ax25_kill_by_device(), or ax25->ax25_dev is NULL, the reason passed to ax25_disconnect() is ENETUNREACH; in that case it waits for all timers to stop before s->ax25_dev is set to NULL in ax25_kill_by_device().
Fixes: 7ec02f5ac8a5 ("ax25: fix NPD bug in ax25_disconnect") Signed-off-by: Duoming Zhou duoming@zju.edu.cn Signed-off-by: David S. Miller davem@davemloft.net [OP: backport to 5.10: adjust context] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Conflict: net/ax25/af_ax25.c Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ax25/af_ax25.c | 4 ++-- net/ax25/ax25_subr.c | 20 ++++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 4100aec3017d..85deb8bae0bb 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -89,16 +89,16 @@ static void ax25_kill_by_device(struct net_device *dev) sk = s->sk; if (!sk) { spin_unlock_bh(&ax25_list_lock); - s->ax25_dev = NULL; ax25_disconnect(s, ENETUNREACH); + s->ax25_dev = NULL; spin_lock_bh(&ax25_list_lock); goto again; } sock_hold(sk); spin_unlock_bh(&ax25_list_lock); lock_sock(sk); - s->ax25_dev = NULL; ax25_disconnect(s, ENETUNREACH); + s->ax25_dev = NULL; release_sock(sk); spin_lock_bh(&ax25_list_lock); sock_put(sk); diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 15ab812c4fe4..3a476e4f6cd0 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -261,12 +261,20 @@ void ax25_disconnect(ax25_cb *ax25, int reason) { ax25_clear_queues(ax25);
- if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY)) - ax25_stop_heartbeat(ax25); - ax25_stop_t1timer(ax25); - ax25_stop_t2timer(ax25); - ax25_stop_t3timer(ax25); - ax25_stop_idletimer(ax25); + if (reason == ENETUNREACH) { + del_timer_sync(&ax25->timer); + del_timer_sync(&ax25->t1timer); + del_timer_sync(&ax25->t2timer); + del_timer_sync(&ax25->t3timer); + del_timer_sync(&ax25->idletimer); + } else { + if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY)) + ax25_stop_heartbeat(ax25); + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + }
ax25->state = AX25_STATE_0;
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: features bugzilla: https://gitee.com/openeuler/kernel/issues/I4UQ08 CVE: NA
--------------------------------
The sysctl_overload_detect_period indicates the maximum time that an offline task can be preempted by online tasks. Currently, its minimum is 1s, which is too long for a vcpu thread in a VM. So set its minimum to 100ms.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 89ef0c1a1642..7daa5b045a17 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2699,7 +2699,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one_thousand, + .extra1 = &one_hundred, .extra2 = &hundred_thousand, }, {
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: bugfix bugzilla: 186414, https://gitee.com/openeuler/kernel/issues/I53YXV CVE: NA
--------------------------------
Patch 1919867e8bad moved the allocation of kfence_pool earlier, into setup_arch(). Since parameters registered with the module_param_cb macro are parsed after setup_arch(), setting sample_interval and num_objects on the cmdline has no effect on that allocation. Add early_param handlers to parse the cmdline so that the values take effect before the allocation of kfence_pool.
Fixes: 1919867e8bad ("arm64: remove page granularity limitation from KFENCE") Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/kfence/core.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+)
diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 02bd7b468162..fcc79594020c 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -86,6 +86,19 @@ static const struct kernel_param_ops sample_interval_param_ops = { }; module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
+#ifdef CONFIG_ARM64 +static int __init parse_sample_interval(char *str) +{ + unsigned long num; + + if (kstrtoul(str, 0, &num) < 0) + return 0; + kfence_sample_interval = num; + return 0; +} +early_param("kfence.sample_interval", parse_sample_interval); +#endif + /* Pool usage% threshold when currently covered allocations are skipped. */ static unsigned long kfence_skip_covered_thresh __read_mostly = 75; module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); @@ -139,6 +152,21 @@ static const struct kernel_param_ops num_objects_param_ops = { .get = param_get_num_objects, }; module_param_cb(num_objects, &num_objects_param_ops, &kfence_num_objects, 0600); + +#ifdef CONFIG_ARM64 +static int __init parse_num_objects(char *str) +{ + unsigned long num; + + if (kstrtoul(str, 0, &num) < 0) + return 0; + if (num < MIN_KFENCE_OBJECTS || num > MAX_KFENCE_OBJECTS) + return 0; + kfence_num_objects = num; + return 0; +} +early_param("kfence.num_objects", parse_num_objects); +#endif #endif
/*
From: Kefeng Wang wangkefeng.wang@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I545FF CVE: NA
--------------------------------
The user wants to reserve a certain amount of memory for normal (non-huge) pages; that is, hugetlb must not be allowed to use all of the memory.
Add a new kernel parameter "hugepage_prohibit_sz=" to set the size reserved for normal non-huge pages, and fail a huge page allocation if the new allocation would exceed the limit.
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Peng Liu liupeng256@huawei.com Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../admin-guide/kernel-parameters.txt | 7 ++ mm/hugetlb.c | 73 +++++++++++++++++++ 2 files changed, 80 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 74c25228aec4..ad50e4ebef68 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1564,6 +1564,13 @@ hugepages using the cma allocator. If enabled, the boot-time allocation of gigantic hugepages is skipped.
+ hugepage_prohibit_sz= + [HW] HugeTLB pages should not alloc when the rest of + the normal pages less than hugepage_prohibit_sz. This + setting is to make sure a system can start even when + part of physical memory is broken, admin users can + adjust this according to typical environment. + hugepages= [HW] Number of HugeTLB pages to allocate at boot. If this follows hugepagesz (below), it specifies the number of pages of hugepagesz to be allocated. diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 34f3dfba5e82..a8c815386ecc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1784,6 +1784,33 @@ pgoff_t hugetlb_basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; }
+#define HUGE_PAGE_BOOTMEM_ALLOC 0 +#define HUGE_PAGE_FRESH_ALLOC 1 + +static u64 normal_page_reserve_sz; + +static int __init early_normal_page_reserve(char *p) +{ + unsigned long long size; + + if (!p) + return 1; + + size = memparse(p, &p); + if (*p) { + pr_warn("HugeTLB: Invalid normal page reserved size\n"); + return 1; + } + + normal_page_reserve_sz = size & PAGE_MASK; + + pr_info("HugeTLB: Normal page reserved %lldMB\n", + normal_page_reserve_sz >> 20); + + return 0; +} +early_param("hugepage_prohibit_sz", early_normal_page_reserve); + static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) @@ -1831,6 +1858,45 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, return page; }
+static bool __ref huge_page_limit_check(int type, size_t hsize, int nid) +{ + u64 mem_usable = 0; + char *str = NULL; + char buf[32]; + + if (!normal_page_reserve_sz) + return true; + + if (system_state > SYSTEM_SCHEDULING) + return true; + + if (normal_page_reserve_sz >= memblock_phys_mem_size()) { + mem_usable = memblock_phys_mem_size(); + str = "physical memory"; + goto out; + } + + if (type == HUGE_PAGE_BOOTMEM_ALLOC) { + mem_usable = memblock_phys_mem_size() - memblock_reserved_size(); + str = "memblock usable"; + } else if (type == HUGE_PAGE_FRESH_ALLOC) { + mem_usable = nr_free_pages() << PAGE_SHIFT; + str = "free page"; + } + + if (mem_usable < normal_page_reserve_sz + hsize) + goto out; + + return true; +out: + string_get_size(hsize, 1, STRING_UNITS_2, buf, 32); + pr_info("HugeTLB: allocating(%s) + Normal pages reserved(%lldMB) node%d exceed %s size(%lldMB)\n", + buf, normal_page_reserve_sz >> 20, + nid, str, mem_usable >> 20); + + return false; +} + /* * Common helper to allocate a fresh hugetlb page. All specific allocators * should use this function to get new hugetlb pages @@ -1843,6 +1909,9 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, bool retry = false;
retry: + if (!huge_page_limit_check(HUGE_PAGE_FRESH_ALLOC, huge_page_size(h), nid)) + return NULL; + if (hstate_is_gigantic(h)) page = alloc_gigantic_page(h, gfp_mask, nid, nmask); else @@ -2637,6 +2706,10 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
if (nid != NUMA_NO_NODE && nid >= nr_online_nodes) return 0; + + if (!huge_page_limit_check(HUGE_PAGE_BOOTMEM_ALLOC, huge_page_size(h), nid)) + return 0; + /* do node specific alloc */ if (nid != NUMA_NO_NODE) { m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),