From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6AXGS CVE: NA
--------------------------------
syzbot is reporting GFP_KERNEL allocation with oom_lock held when reporting memcg OOM [1]. If this allocation triggers the global OOM situation then the system can livelock because the GFP_KERNEL allocation with oom_lock held cannot trigger the global OOM killer because __alloc_pages_may_oom() fails to hold oom_lock.
The problem mentioned above has been fixed by patch [2]. The same problem exists in the memcg_memfs_info feature too. Following patch [2], fix it by removing the allocation from mem_cgroup_print_memfs_info() completely and passing a static buffer when calling it from the memcg OOM path.
Link: https://syzkaller.appspot.com/bug?extid=2d2aeadc6ce1e1f11d45 [1] Link: https://lkml.kernel.org/r/86afb39f-8c65-bec2-6cfc-c5e3cd600c0b@I-love.SAKURA... [2] Fixes: 6b1d4d3a3713 ("mm/memcg_memfs_info: show files that having pages charged in mem_cgroup") Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- include/linux/memcg_memfs_info.h | 4 +++- mm/memcg_memfs_info.c | 20 ++++++++++---------- mm/memcontrol.c | 3 ++- 3 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/include/linux/memcg_memfs_info.h b/include/linux/memcg_memfs_info.h index 658a91e22bd7..b5e3709baa9e 100644 --- a/include/linux/memcg_memfs_info.h +++ b/include/linux/memcg_memfs_info.h @@ -6,11 +6,13 @@ #include <linux/seq_file.h>
#ifdef CONFIG_MEMCG_MEMFS_INFO -void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, struct seq_file *m); +void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, char *pathbuf, + struct seq_file *m); int mem_cgroup_memfs_files_show(struct seq_file *m, void *v); void mem_cgroup_memfs_info_init(void); #else static inline void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, + char *pathbuf, struct seq_file *m) { } diff --git a/mm/memcg_memfs_info.c b/mm/memcg_memfs_info.c index 346175026cae..632e03da673b 100644 --- a/mm/memcg_memfs_info.c +++ b/mm/memcg_memfs_info.c @@ -162,7 +162,8 @@ static void memfs_show_files_in_mem_cgroup(struct super_block *sb, void *data) mntput(pfc->vfsmnt); }
-void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, struct seq_file *m) +void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, char *pathbuf, + struct seq_file *m) { struct print_files_control pfc = { .memcg = memcg, @@ -170,17 +171,11 @@ void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, struct seq_file *m) .max_print_files = memfs_max_print_files, .size_threshold = memfs_size_threshold, }; - char *pathbuf; int i;
if (!memfs_enable || !memcg) return;
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!pathbuf) { - SEQ_printf(m, "Show memfs failed due to OOM\n"); - return; - } pfc.pathbuf = pathbuf; pfc.pathbuf_size = PATH_MAX;
@@ -197,15 +192,20 @@ void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, struct seq_file *m) SEQ_printf(m, "total files: %lu, total memory-size: %lukB\n", pfc.total_print_files, pfc.total_files_size >> 10); } - - kfree(pfc.pathbuf); }
int mem_cgroup_memfs_files_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + char *pathbuf;
- mem_cgroup_print_memfs_info(memcg, m); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathbuf) { + SEQ_printf(m, "Show memfs abort: failed to allocate memory\n"); + return 0; + } + mem_cgroup_print_memfs_info(memcg, pathbuf, m); + kfree(pathbuf); return 0; }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bdc90e6fc082..fd40fef49e45 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1490,6 +1490,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { struct mem_cgroup *iter; unsigned int i; + static char pathbuf[PATH_MAX];
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), @@ -1522,7 +1523,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) pr_cont("\n"); }
- mem_cgroup_print_memfs_info(memcg, NULL); + mem_cgroup_print_memfs_info(memcg, pathbuf, NULL); }
/*
From: Wang Wensheng wangwensheng4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6FK2R CVE: NA
-------------------------------
This feature is not actually used and introduces a list double-add problem. Just delete its source.
------------[ cut here ]------------ list_add double add: new=ffff20000cdd1780, prev=ffff20000cdd1780, next=ffff20000cd1f300. WARNING: CPU: 1 PID: 31515 at lib/list_debug.c:35 __list_add_valid+0x124/0x158 lib/list_debug.c:33 Modules linked in: CPU: 1 PID: 31515 Comm: syz-executor.2 Not tainted 4.19.90 #1 Hardware name: linux,dummy-virt (DT) pstate: 80400005 (Nzcv daif +PAN -UAO) pc : __list_add_valid+0x124/0x158 lib/list_debug.c:33 lr : __list_add_valid+0x124/0x158 lib/list_debug.c:33 ... Call trace: __list_add_valid+0x124/0x158 lib/list_debug.c:33 __list_add include/linux/list.h:60 [inline] list_add_tail include/linux/list.h:93 [inline] register_shrinker_prepared+0x4c/0x130 mm/vmscan.c:420 register_shrinker+0x38/0x50 mm/vmscan.c:431 hugepage_tuning_enable+0x60/0x360 mm/hugepage_tuning.c:558 hp_enable_store+0x88/0x108 mm/hugepage_tuning.c:460 hugepage_tuning_attr_store+0x68/0x98 mm/hugepage_tuning.c:402 sysfs_kf_write+0x114/0x190 fs/sysfs/file.c:139 kernfs_fop_write+0x264/0x4b8 fs/kernfs/file.c:316 __vfs_write+0xf4/0x5a0 fs/read_write.c:487 vfs_write+0x144/0x400 fs/read_write.c:551 ksys_write+0xf4/0x238 fs/read_write.c:601 __do_sys_write fs/read_write.c:613 [inline] __se_sys_write fs/read_write.c:610 [inline] __arm64_sys_write+0x74/0xa8 fs/read_write.c:610 __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline] invoke_syscall arch/arm64/kernel/syscall.c:48 [inline] el0_svc_common+0x134/0x570 arch/arm64/kernel/syscall.c:121 el0_svc_handler+0x190/0x260 arch/arm64/kernel/syscall.c:190 el0_svc+0x10/0x640 arch/arm64/kernel/entry.S:1028 ---[ end trace 328ad58f62232ded ]---
Revert "arm64/ascend: Add auto tuning hugepage module" This reverts commit ecec54f47fce64563561fda3df88d01d3bf5713f.
Revert "arm64/ascend: Add hugepage flags change interface" This reverts commit db1d159bacd4156b965d8158bc32dfde29353c52.
Revert "arm64/ascend: Add set hugepage number helper function" This reverts commit b6bcd500561c908d757caaa39ee719a63cf1ada0.
Revert "arm64/ascend: Add mmap hook when alloc hugepage" This reverts commit d9952490423c6361b5257b506746e0e46319314e.
Revert "arm64/ascend: Add new CONFIG for auto-tuning hugepage" This reverts commit 2597ada28ffe244c98c605311ce3069742dddcf0.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- arch/arm64/Kconfig | 9 - mm/Makefile | 1 - mm/hugepage_tuning.c | 693 ------------------------------------------- mm/hugepage_tuning.h | 70 ----- mm/hugetlb.c | 22 -- mm/mmap.c | 29 -- 6 files changed, 824 deletions(-) delete mode 100644 mm/hugepage_tuning.c delete mode 100644 mm/hugepage_tuning.h
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 003e333ad864..e24792913bfb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1486,15 +1486,6 @@ config ASCEND_WATCHDOG_SYSFS_CONFIGURE watchdog. The kernel thread could be notified so it's ok to make that change when the watchdog is pinged by kernel thread.
-config ASCEND_AUTO_TUNING_HUGEPAGE - bool "Enable support for the auto-tuning hugepage" - depends on HUGETLBFS - depends on MEMCG - default y - help - The hugepage auto-tuning means the kernel dynamically manages the number of - huage pages. To achieve this purpose, custom interfaces are required. - config ASCEND_SHARE_POOL bool "Enable support for the Share Pool Memory" default n diff --git a/mm/Makefile b/mm/Makefile index 38291476ce22..7f19e97ce466 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -106,7 +106,6 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o -obj-$(CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE) += hugepage_tuning.o obj-$(CONFIG_PIN_MEMORY) += pin_mem.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o diff --git a/mm/hugepage_tuning.c b/mm/hugepage_tuning.c deleted file mode 100644 index 0ce0c3a46ae0..000000000000 --- a/mm/hugepage_tuning.c +++ /dev/null @@ -1,693 +0,0 @@ -/* - * Copyright (C) Huawei Technologies Co., Ltd. 2019. All rights reserved. - * Author: Huawei OS Kernel Lab - * Create: Fri Jan 11 10:45:12 2019 - */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/kthread.h> -#include <linux/freezer.h> -#include <linux/fs.h> -#include <linux/stat.h> -#include <linux/buffer_head.h> -#include <linux/hugetlb.h> -#include <linux/mm.h> -#include <linux/cgroup.h> -#include <linux/memcontrol.h> -#include <linux/sched/mm.h> -#include <asm/segment.h> -#include <linux/uaccess.h> -#include "hugepage_tuning.h" - -MODULE_LICENSE("GPL"); -MODULE_VERSION("0.01"); - -/* config huge page number */ -/* We had a hard limit: 50% * total memory */ -static unsigned long config_hugepage_nr; -module_param(config_hugepage_nr, ulong, 0644); - -/* max hugepages. 
ratio from 1-50 */ -static unsigned int config_ratio = 50; -module_param(config_ratio, int, 0644); - -/* config memcgroup */ -static char *config_memcgroup = "usermemory"; -module_param(config_memcgroup, charp, 0644); - -/* cooldown time */ -static unsigned int config_cooldown_time = 60; -module_param(config_cooldown_time, int, 0644); - -/* cpu mask */ -static unsigned int config_cpu_mask; -module_param(config_cpu_mask, int, 0644); - -/* auto drop cache */ -static unsigned int config_drop_cache = 1; -module_param(config_drop_cache, int, 0644); - -/* auto compat */ -static unsigned int config_mem_compat; -module_param(config_mem_compat, int, 0644); - -static struct shrinker huge_tuning_shrinker = { - .count_objects = hugepage_tuning_shrink, - .scan_objects = hugepage_tuning_scan, - .seeks = DEFAULT_SEEKS, -}; - -/* pointer to hugepage status */ -static const struct hstate *hs; -/* pointer to hugepage tuning sysfs node */ -static struct kobject *hp_sysfs_node; - -/* kernel hugepage tuning main worker thrad */ -static struct task_struct *khptuning_thread __read_mostly; - -/* used to wakeup */ -static int notify_flag; -static int cooldown_time; -static DECLARE_WAIT_QUEUE_HEAD(khptuning_wait); -static DEFINE_MUTEX(tuning_lock); -static struct hugepage_tuning hp; - -static char buff[BUFF_LEN]; -/* this function used to write sys file. 
*/ -int sysctl_write_file(char *path, int nr) -{ - struct file *filp = NULL; - int err; - mm_segment_t oldfs; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - filp = filp_open(path, O_WRONLY, 0200); - if (IS_ERR(filp)) { - err = PTR_ERR(filp); - pr_info("hugepage_tuning: open file %s failed err %d\n", - path, err); - return err; - } - memset(buff, 0, sizeof(buff)); - sprintf(buff, "%d\n", nr); - err = filp->f_op->write(filp, buff, sizeof(buff), &filp->f_pos); - if (err < 0) { - pr_err("hugepage_tuning: write file %s faild err %d]n", - path, err); - } - - set_fs(oldfs); - filp_close(filp, NULL); - return err; -} - -static struct kernfs_open_file *kernfs_of(struct file *file) -{ - return ((struct seq_file *)file->private_data)->private; -} - -/* get memory cgroup from /sys/fs/cgroup */ -struct mem_cgroup *get_mem_cgroup_from_path(void) -{ - struct file *filp = NULL; - int err; - char path[PATH_LEN]; - struct kernfs_open_file *of; - struct mem_cgroup *mcg; - - strreplace(config_memcgroup, '\n', '\0'); - snprintf(path, sizeof(path), MEMCGR, config_memcgroup); - filp = filp_open(path, O_WRONLY, 0200); - if (IS_ERR(filp)) { - err = PTR_ERR(filp); - pr_info("hugepage_tuning: open file %s failed err %d\n", - path, err); - return NULL; - } - of = kernfs_of(filp); - mcg = mem_cgroup_from_css(of_css(of)); - filp_close(filp, NULL); - - return mcg; -} -/* - * This function call sysctl_set_hugepage to increase or reduce the total num - * of hugepage. 
-nr: total hugepage nr -ret val: diff of total hugepage nr - */ -s64 sysctl_set_hugepage(u64 nr) -{ - int err; - u64 total = hs->nr_huge_pages; - - if (total == nr) { - /* nothing todo */ - err = 0; - goto out; - } - /* call sysctrl to change hugepage num */ - err = hugetlb_sysctl_store(nr); - if (err < 0) - goto out; - - /* return diff nr */ - err = hs->nr_huge_pages - total; -out: - return err; -} -/* shrink as soon as possible */ -unsigned long hugepage_tuning_shrink(struct shrinker *s, - struct shrink_control *sc) -{ - int free_nr = 0; - - /* do not shrink when the tuning thread is hot, wait 10 seconds */ - if (!time_after(jiffies, hp.adjust_time + 10 * HZ) || hp.hot) - return 0; - - /* free 10% * free huge page */ - if (hs->free_huge_pages > hp.mmap_last) { - /* reserve at least one mmap step */ - free_nr = (hs->free_huge_pages - hp.mmap_last) / 10; - } - - if (free_nr > 0) { - /* free hugepage, no need to count */ - sysctl_set_hugepage(hs->nr_huge_pages - free_nr); - hp.shrink_count += free_nr; - } - - return free_nr; -} - -unsigned long hugepage_tuning_scan(struct shrinker *s, - struct shrink_control *sc) -{ - /* just retuern 0 */ - return 0; -} - -static int mmap_notifier(struct notifier_block *self, unsigned long arg1, - void *arg2) -{ - u64 nr = arg1 / (2 * SIZE_MB); - - /* record max step tied by [MMAP_MIN, MMAP_MAX]*/ - if (nr > MMAP_MAX) - hp.mmap_last = MMAP_MAX; - else if (nr > MMAP_MIN) - hp.mmap_last = nr; - else - hp.mmap_last = MMAP_MIN; - - /* if there's not enough free huge page */ - if (nr > hs->free_huge_pages) { - /* wakeup cool worker to alloc more hugepage */ - if (!cooldown_time) { - hp.mmap_fail++; - /* don't bother a hot worker */ - notify_flag = 1; - wake_up(&khptuning_wait); - } else { - /* the worker is hot, just ignore */ - hp.mmap_fail_hot++; - } - } else { - /* nice try */ - hp.mmap_succ++; - } - return 0; -} -static struct notifier_block mmap_handle = { - .notifier_call = mmap_notifier -}; - -static int oom_notifier(struct 
notifier_block *self, - unsigned long arg1, void *arg2) -{ - *(unsigned long *)arg2 = hugepage_tuning_shrink(NULL, NULL); - return 0; -} -static struct notifier_block oom_handle = { - .notifier_call = oom_notifier -}; - -static void hugepage_tuning_shake(struct hugepage_tuning *hp) -{ - int err; - /* there's enough memory, but fragmentization */ - /* drop cache and compact_memory and retry */ - if (config_drop_cache) { - err = sysctl_write_file(PATH_DROP, 3); - if (!err) - pr_info("hugepage_tuning: do drop cache!\n"); - - cooldown_time = config_cooldown_time; - hp->stat_drop_compat++; - } - if (config_mem_compat) { - err = sysctl_write_file(PATH_COMPAT, 1); - if (!err) - pr_info("hugepage_tuning: do memory compat!\n"); - - cooldown_time = config_cooldown_time * 10; - hp->stat_drop_compat++; - } -} -/* - * main worker thread - * drop cache and compat memory are hard work, we should prevent the - * shaking by cooldown_time - */ -static int khptuningd(void *none) -{ - struct mem_cgroup *memcg; - u64 last_miss; - u64 want = 0; - u64 available; - u64 step_nr; - s64 num; - struct sysinfo i; - u64 system_free; - - set_freezable(); - set_user_nice(current, MAX_NICE); - last_miss = hp.mmap_fail; - - /* setup memcgroup */ - memcg = get_mem_cgroup_from_path(); - if (!memcg) { - pr_err("hugepage_tuning: can't find memcgroup [%s]\n", - config_memcgroup); - khptuning_thread = NULL; - return -EINVAL; - } - memalloc_use_memcg(memcg); - - /* create huge page */ - hp.hot = 1; - sysctl_set_hugepage(hp.init_nr); - hp.hot = 0; - - /* check if we should stop */ - while (!kthread_should_stop()) { - /* 1st. each cycle we count 'available' and system free */ - hp.hot = 1; - available = hp.max_nr > hs->nr_huge_pages ? - hp.max_nr - hs->nr_huge_pages : 0; - /* system memory threadhold */ - si_meminfo(&i); - system_free = (i.freeram + i.bufferram) * 4 * SIZE_KB / - (2 * SIZE_MB); - - /* 2nd. 
mmap_fail more than last_miss means in the last cycle - * there's new mmap fail occur, so we need more page. - */ - if (hp.mmap_fail > last_miss) { - /* max step, should not bigger than MMAP_MAX */ - step_nr = hp.mmap_last > MMAP_MAX ? - MMAP_MAX : hp.mmap_last; - want = (hp.mmap_fail - last_miss) * step_nr; - } - - /* 3rd. now we hava available, wanted, free. only free < want - * < available + free we can create new huge page - */ - if (want > 0 && want <= (available + hs->free_huge_pages)) { - if (want < (system_free / 2)) { - num = sysctl_set_hugepage(hs->nr_huge_pages + want); - hp.adjust_count += num; - hp.adjust_time = jiffies; - } else { - num = 0; - hp.adjust_fail++; - } - - if (num + hs->free_huge_pages >= want) { - /* very good, there's enough memory */ - last_miss = hp.mmap_fail; - } else { - /* do drop cache and compat when there's at - * least 1GB memory - */ - if (cooldown_time == 0 && system_free > 500) { - hugepage_tuning_shake(&hp); - } else { - /* very bad. retry fail. */ - last_miss = hp.mmap_fail; - hp.adjust_fail++; - } - } - } else { - /* there' no work to do or no enough memory: - * 1. mmap is too large, more than hs->free_huge_pages. - * 2. 
reach the max_nr limit - * just update stat miss - */ - last_miss = hp.mmap_fail; - if (want > 0) - hp.adjust_fail++; - } - - /* cycle done, reset all vals */ - if (notify_flag == 0 && cooldown_time > 0) { - /* only timeout process can reduce cooldown time */ - cooldown_time--; - } - want = 0; - notify_flag = 0; - hp.stat_wake++; - hp.hot = 0; - /* start cycle every second or wake up by notify_flag */ - wait_event_timeout(khptuning_wait, (notify_flag == 1), 10 * HZ); - } - - memalloc_unuse_memcg(); - mem_cgroup_put(memcg); - return 0; -} - -/* unregister sysfs */ -static void hp_sysfs_release(struct kobject *kobj) -{ - kfree(kobj); -} - -static ssize_t hugepage_tuning_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct kobj_attribute *kattr; - ssize_t ret = -EIO; - - kattr = container_of(attr, struct kobj_attribute, attr); - if (kattr->show) - ret = kattr->show(kobj, kattr, buf); - return ret; -} - -static ssize_t hugepage_tuning_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t count) -{ - struct kobj_attribute *kattr; - ssize_t ret = -EIO; - - kattr = container_of(attr, struct kobj_attribute, attr); - if (kattr->store) - ret = kattr->store(kobj, kattr, buf, count); - return ret; -} - -static const struct sysfs_ops hp_sysfs_ops = { - .show = hugepage_tuning_attr_show, - .store = hugepage_tuning_attr_store, -}; - -static ssize_t hp_stat_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "initnr: %lld\n" - "maxnr: %lld\n" - "ratio: %d\n" - "huge_nr: %ld\n" - "free_nr: %ld\n" - "mmap_last: %lld\n" - "mmap_succ: %lld\n" - "mmap_fail: %lld\n" - "mmap_fail_hot: %lld\n" - "shrink_count: %lld\n" - "wake: %lld\n" - "adjust_count: %lld\n" - "adjust_fail: %lld\n" - "drop_compat: %lld\n", - hp.init_nr, hp.max_nr, hp.ratio, hs->nr_huge_pages, - hs->free_huge_pages, - hp.mmap_last, hp.mmap_succ, hp.mmap_fail, - hp.mmap_fail_hot, - hp.shrink_count, - hp.stat_wake, 
hp.adjust_count, hp.adjust_fail, - hp.stat_drop_compat); -} - -static ssize_t hp_stat_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t len) -{ - return -EACCES; -} - -static struct kobj_attribute hugepage_tuning_attr_name = -__ATTR(status, 0444, hp_stat_show, hp_stat_store); - -static char hp_enable[BUFF_LEN] = "0\n"; - -static ssize_t hp_enable_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, hp_enable); -} - -static ssize_t hp_enable_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t len) -{ - int err = 0; - - if (strcmp(buf, "1\n") == 0) { - err = hugepage_tuning_enable(); - if (err < 0) - return err; - hp_enable[0] = '1'; - } else if (strcmp(buf, "0\n") == 0) { - hugepage_tuning_disable(); - hp_enable[0] = '0'; - } else { - pr_err("hugepage_tuning: invalid val to enable: %s(len %ld)\n", - buf, len); - return -EINVAL; - } - return len; -} - -static struct kobj_attribute hugepage_tuning_attr_string = -__ATTR(enable, 0644, hp_enable_show, hp_enable_store); - -static struct attribute *default_attr[] = { - &hugepage_tuning_attr_name.attr, - &hugepage_tuning_attr_string.attr, - NULL, -}; - -static struct kobj_type hp_sysfs_type = { - .sysfs_ops = &hp_sysfs_ops, - .release = hp_sysfs_release, - .default_attrs = default_attr -}; - -/* config hugepage tuning thread */ -int hugepage_tuning_config(void) -{ - int err = 0; - struct sysinfo i; - u64 half; - - memset(&hp, 0, sizeof(hp)); - - /* 1st. we use ratio to config max nr */ - hp.ratio = config_ratio; - si_meminfo(&i); - half = (i.totalram * 50 * 4 * SIZE_KB) / (100 * SIZE_MB * 2); - - if (hp.ratio >= 0 && hp.ratio <= 50) { - /* get sys meminfo, scale KB */ - hp.max_nr = (i.totalram * hp.ratio * 4 * SIZE_KB) / - (100 * SIZE_MB * 2); - - } else { - pr_info("hugepage_tuning: invalid ratio (%d), should in [0,50]\n", hp.ratio); - err = -EINVAL; - goto out; - } - - /* 2nd. 
config_hugepage_nr */ - if (config_hugepage_nr > half) { - pr_info("hugepage_tuning: invalid config_hugepage_nr (%ld), should less than half total memory.\n", config_hugepage_nr); - err = -EINVAL; - goto out; - } - if (config_hugepage_nr > 0) - hp.max_nr = config_hugepage_nr; - - /* 3rd. config init nr not more than 25% * total for compatibility */ - if (config_ratio > 25) - hp.init_nr = (i.totalram * 25 * 4 * SIZE_KB) / (100 * SIZE_MB * 2); - - hp.init_nr = hp.init_nr < hp.max_nr ? hp.init_nr : hp.max_nr; - - /* 4th. left */ - hp.mmap_last = MMAP_MIN; //default to MMAP_MIN to prevent too small step - -out: - return err; -} - -/* create tuning thread and enable worker */ -int hugepage_tuning_enable(void) -{ - int err = 0; - - /* lock */ - mutex_lock(&tuning_lock); - - if (khptuning_thread) { - /* dup enable */ - pr_info("hugepage_tuning: hugepage tuning dup enable!\n"); - err = -EINVAL; - goto out; - } - - /* 1st. config tuning's hugepage nr */ - err = hugepage_tuning_config(); - if (err < 0) - goto fail; - - /* 2nd. register shrinker */ - err = register_shrinker(&huge_tuning_shrinker); - if (err < 0) { - pr_info("hugepage_tuning: register shrinker failed! err = %d\n", err); - goto fail; - } - - /* 3rd. register mmap notifier */ - err = register_mmap_notifier(&mmap_handle); - if (err < 0) { - /* roll back register */ - unregister_shrinker(&huge_tuning_shrinker); - - pr_info("hugepage_tuning: register mmap handle failed! err = %d\n", err); - goto fail; - } - /* 3rd. register mmap notifier */ - err = register_hisi_oom_notifier(&oom_handle); - if (err < 0) { - /* roll back register */ - unregister_shrinker(&huge_tuning_shrinker); - unregister_mmap_notifier(&mmap_handle); - - pr_info("hugepage_tuning: register oom handle failed! err = %d\n", err); - goto fail; - } - /* 4th. 
create and start thread */ - khptuning_thread = kthread_run(khptuningd, NULL, "khptuningd"); - if (IS_ERR(khptuning_thread)) { - /* roll back register */ - unregister_shrinker(&huge_tuning_shrinker); - unregister_mmap_notifier(&mmap_handle); - unregister_hisi_oom_notifier(&oom_handle); - - err = PTR_ERR(khptuning_thread); - khptuning_thread = NULL; - pr_info("hugepage_tuning: kthread_run(khugepaged) failed err = %d\n", err); - goto fail; - } - /* default bind to cpu 0 */ - err = set_cpus_allowed_ptr(khptuning_thread, cpumask_of(config_cpu_mask)); - if (err < 0) { - /* roll back register */ - unregister_shrinker(&huge_tuning_shrinker); - unregister_mmap_notifier(&mmap_handle); - unregister_hisi_oom_notifier(&oom_handle); - /* stop thread */ - kthread_stop(khptuning_thread); - khptuning_thread = NULL; - - pr_err("Failed to set affinity to 0x%x CPU\n", config_cpu_mask); - goto fail; - } - - hugepage_gfp_mask = __GFP_ACCOUNT; - mmap_notifier_enable = 1; - /* unlock */ - mutex_unlock(&tuning_lock); - return 0; -fail: - /* reset all hugepage */ - sysctl_set_hugepage(0); -out: - /* unlock */ - mutex_unlock(&tuning_lock); - return err; -} - -/* disable worker and destroy tuning thread */ -void hugepage_tuning_disable(void) -{ - /* lock */ - mutex_lock(&tuning_lock); - - /* 1nd. unregister */ - unregister_shrinker(&huge_tuning_shrinker); - unregister_mmap_notifier(&mmap_handle); - unregister_hisi_oom_notifier(&oom_handle); - - /* 2nd. stop thread */ - if (khptuning_thread) { - kthread_stop(khptuning_thread); - khptuning_thread = NULL; - } - - /* 3nd. 
free all hugepage */ - sysctl_set_hugepage(0); - - /* reset */ - hugepage_gfp_mask = 0; - mmap_notifier_enable = 0; - - /* unlock */ - mutex_unlock(&tuning_lock); -} - -/* module init */ -static int __init hugepage_tuning_init(void) -{ - int err = 0; - - /* clean */ - memset(&hp, 0, sizeof(hp)); - - /* global get hstate once */ - hs = hugetlb_get_hstate(); - - /* sysfs create */ - hp_sysfs_node = kzalloc(sizeof(*hp_sysfs_node), GFP_KERNEL); - if (!hp_sysfs_node) { - pr_err("hugepage_tuning: alloc hp_sysfs_node faile!\n"); - return -EINVAL; - } - - err = kobject_init_and_add(hp_sysfs_node, &hp_sysfs_type, - NULL, "hugepage_tuning"); - if (err) { - pr_err("hugepage_tuning: add hp_sysfs_node faile! err = %d.\n", err); - /* free the mem */ - kobject_put(hp_sysfs_node); - return -EINVAL; - } - return 0; -} - -/* module exit */ -static void __exit hugepage_tuning_exit(void) -{ - /* disable tuning thread */ - hugepage_tuning_disable(); - - /* unlink kobject from hierarchy */ - kobject_del(hp_sysfs_node); -} - -module_init(hugepage_tuning_init); -module_exit(hugepage_tuning_exit); diff --git a/mm/hugepage_tuning.h b/mm/hugepage_tuning.h deleted file mode 100644 index b772e7e5cc63..000000000000 --- a/mm/hugepage_tuning.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) Huawei Technologies Co., Ltd. 2019. All rights reserved. 
- * Author: Huawei OS Kernel Lab - * Create: Fri Jan 11 10:45:12 2019 - */ -#ifndef __MM_HUGEPAGE_TUNING_H__ -#define __MM_HUGEPAGE_TUNING_H__ - -#define BUFF_LEN (32) -#define PATH_LEN (128) -#define SIZE_MB (1024 * 1024) -#define SIZE_KB (1024) -#define MMAP_MAX (100) -#define MMAP_MIN (10) -#define PATH_DROP "/proc/sys/vm/drop_caches" -#define PATH_COMPAT "/proc/sys/vm/compact_memory" -#define MEMCGR "/sys/fs/cgroup/memory/%s/memory.limit_in_bytes" - -/* extern funcs */ -extern int register_mmap_notifier(struct notifier_block *nb); -extern int unregister_mmap_notifier(struct notifier_block *nb); -extern int hugetlb_sysctl_store(size_t length); -extern int register_hisi_oom_notifier(struct notifier_block *nb); -extern int unregister_hisi_oom_notifier(struct notifier_block *nb); -extern gfp_t hugepage_gfp_mask; -extern int mmap_notifier_enable; - -/* base funcs */ -int hugepage_tuning_config(void); -int hugepage_tuning_enable(void); -void hugepage_tuning_disable(void); - -/* for shrink */ -unsigned long hugepage_tuning_shrink(struct shrinker *s, - struct shrink_control *sc); -unsigned long hugepage_tuning_scan(struct shrinker *s, - struct shrink_control *sc); - -/* hugepage tuning control main struct */ -struct hugepage_tuning { - /* for compatibility, initnr is 25% sys mem*/ - u64 init_nr; - /* max hugepage num, set by user */ - u64 max_nr; - /* max hugepage num(ratio), set by user */ - int ratio; - /* last mmap len */ - u64 mmap_last; - /* mmap count */ - u64 mmap_succ; - /* mmap fail */ - u64 mmap_fail; - /* these misses will be ignore */ - u64 mmap_fail_hot; - /* wake */ - u64 stat_wake; - /* adjust huge page nr fail */ - u64 adjust_fail; - /* adjust huge page nr count */ - u64 adjust_count; - /* adjust time */ - unsigned long adjust_time; - /* shrink hugepage number count */ - u64 shrink_count; - /* drop and compat count */ - u64 stat_drop_compat; - /* hot flag */ - int hot; -}; -#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4f61d3be0b11..ed89df6fc5de 
100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1683,10 +1683,6 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, return page; }
-#ifdef CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE -gfp_t hugepage_gfp_mask = 0; -EXPORT_SYMBOL(hugepage_gfp_mask); -#endif /* * Allocates a fresh page to the hugetlb allocator pool in the node interleaved * manner. @@ -1696,12 +1692,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, { struct page *page; int nr_nodes, node; -#ifdef CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE - gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE | - hugepage_gfp_mask; -#else gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; -#endif
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, @@ -4493,19 +4484,6 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, buffer, length, ppos); }
-#ifdef CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE -int hugetlb_sysctl_store(size_t length) -{ - int ret; - struct hstate *h = &default_hstate; - - ret = __nr_hugepages_store_common(false, h, NUMA_NO_NODE, length, - length); - return ret; -} -EXPORT_SYMBOL(hugetlb_sysctl_store); -#endif - #ifdef CONFIG_NUMA int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) diff --git a/mm/mmap.c b/mm/mmap.c index 0576cad8ccb6..f5b137f5f681 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1355,30 +1355,6 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, return true; }
-#ifdef CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE -int mmap_notifier_enable = 0; -EXPORT_SYMBOL_GPL(mmap_notifier_enable); - -static BLOCKING_NOTIFIER_HEAD(mmap_notify_list); - -int register_mmap_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&mmap_notify_list, nb); -} -EXPORT_SYMBOL_GPL(register_mmap_notifier); - -int mmap_notifier_call(unsigned long val, void *v) -{ - return blocking_notifier_call_chain(&mmap_notify_list, val, v); -} - -int unregister_mmap_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&mmap_notify_list, nb); -} -EXPORT_SYMBOL_GPL(unregister_mmap_notifier); -#endif - static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, @@ -1431,11 +1407,6 @@ __do_mmap(struct mm_struct *mm, struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM;
-#ifdef CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE - /* only notify flags with MAP_HUGETLB */ - if (flags & MAP_HUGETLB && mmap_notifier_enable) - mmap_notifier_call(len, NULL); -#endif /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */
From: Wang Wensheng wangwensheng4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6FK2R CVE: NA
-------------------------------
This feature has been deleted.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- arch/arm64/configs/hulk_defconfig | 1 - arch/arm64/configs/syzkaller_defconfig | 1 - 2 files changed, 2 deletions(-)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index c6ea4e1a3946..06dfc89b644f 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -484,7 +484,6 @@ CONFIG_ASCEND_OOM=y CONFIG_ASCEND_IOPF_HIPRI=y CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES=y CONFIG_ASCEND_WATCHDOG_SYSFS_CONFIGURE=y -CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE=y CONFIG_ASCEND_SHARE_POOL=y CONFIG_ASCEND_CLEAN_CDM=y
diff --git a/arch/arm64/configs/syzkaller_defconfig b/arch/arm64/configs/syzkaller_defconfig index d82959f4027c..6d1eca7cca3a 100644 --- a/arch/arm64/configs/syzkaller_defconfig +++ b/arch/arm64/configs/syzkaller_defconfig @@ -473,7 +473,6 @@ CONFIG_ASCEND_OOM=y CONFIG_ASCEND_IOPF_HIPRI=y CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES=y # CONFIG_ASCEND_WATCHDOG_SYSFS_CONFIGURE is not set -CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE=y CONFIG_ASCEND_SHARE_POOL=y
#
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: bugfix bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I6FCQZ CVE: NA
--------------------------------
When dynamic hugetlb is enabled, the hpool should be NULL for cont-bit hugepages, so set it to NULL explicitly.
Fixes: f15774c66bcd ("dhugetlb: only support 1G/2M hugepage and ARM64_4K_PAGES") Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Nanyong Sun sunnanyong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- fs/hugetlbfs/inode.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d70c8aaf5acc..014ee6533e2e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1183,6 +1183,8 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) if (huge_page_size(sbinfo->hstate) == PMD_SIZE || huge_page_size(sbinfo->hstate) == PUD_SIZE) p->hpool = get_dhugetlb_pool_from_task(current); + else + p->hpool = NULL; #endif
return &p->vfs_inode;
From: Chen Zhongjin chenzhongjin@huawei.com
hulk inclusion category: bugfix bugzilla: 187818, https://gitee.com/openeuler/kernel/issues/I6DK3O CVE: NA
--------------------------------
check_paravirt() calls orc_find() before orc_find() is defined.
If CONFIG_DYNAMIC_FTRACE is enabled, orc_find() is declared earlier and compilation succeeds. Otherwise compilation fails with "implicit declaration of function 'orc_find'".
Move declaration of orc_find() out of CONFIG_DYNAMIC_FTRACE macro to fix this.
Fixes: fecb933c06b8 ("x86/unwind: Fix orc entry for paravirt {save,restore}_fl") Signed-off-by: Chen Zhongjin chenzhongjin@huawei.com Reviewed-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Xu Kuohai xukuohai@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- arch/x86/kernel/unwind_orc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index ec4a14e4f639..d609a7fa08c6 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -79,9 +79,9 @@ static struct orc_entry *orc_module_find(unsigned long ip) } #endif
-#ifdef CONFIG_DYNAMIC_FTRACE static struct orc_entry *orc_find(unsigned long ip);
+#ifdef CONFIG_DYNAMIC_FTRACE /* * Ftrace dynamic trampolines do not have orc entries of their own. * But they are copies of the ftrace entries that are static and