From: Guo Mengqi guomengqi3@huawei.com
ascend inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4MUV2
CVE: NA
When k2u is executed on the whole sharepool group and one process is coredumping, k2u will skip the coredumped process and continue on the remaining processes in the group.
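For readers skimming the diff below: the change threads a small sp_k2u_context through sp_remap_kva_to_vma() so the per-group loop can tell a coredump abort apart from a real remap failure. A condensed sketch of the resulting loop in sp_make_share_kva_to_spg(), using the same names as the patch:

	list_for_each_entry(spg_node, &spg->procs, proc_node) {
		mm = spg_node->master->mm;
		kc.state = K2U_NORMAL;
		ret_addr = sp_remap_kva_to_vma(kva, spa, mm, spg_node->prot, &kc);
		if (IS_ERR_VALUE(ret_addr)) {
			/* target mm is coredumping: skip it and keep going */
			if (kc.state == K2U_COREDUMP)
				continue;
			/* any other failure still unwinds and aborts */
			uva = (void *)ret_addr;
			pr_err("remap k2u to spg failed %ld\n", PTR_ERR(uva));
			__sp_free(spg, spa->va_start, spa_size(spa), mm);
			goto out;
		}
		uva = (void *)ret_addr;
	}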
Signed-off-by: Guo Mengqi guomengqi3@huawei.com
Reviewed-by: Weilong Chen chenweilong@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Reviewed-by: Weilong Chen chenweilong@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 mm/share_pool.c | 50 +++++++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 18 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 494a829d6f3a..f18bcd188027 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -666,8 +666,25 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, struct sp_area *spa, unsigned long *populate, unsigned long prot); static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size); + +#define K2U_NORMAL 0 +#define K2U_COREDUMP 1 + +struct sp_k2u_context { + unsigned long kva; + unsigned long kva_aligned; + unsigned long size; + unsigned long size_aligned; + unsigned long sp_flags; + int state; + int spg_id; + bool to_task; + struct timespec64 start; + struct timespec64 end; +}; + static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, - struct mm_struct *mm, unsigned long prot); + struct mm_struct *mm, unsigned long prot, struct sp_k2u_context *kc);
static void free_sp_group_id(int spg_id) { @@ -1313,7 +1330,7 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) spin_unlock(&sp_area_lock);
if (spa->type == SPA_TYPE_K2SPG && spa->kva) { - addr = sp_remap_kva_to_vma(spa->kva, spa, mm, prot); + addr = sp_remap_kva_to_vma(spa->kva, spa, mm, prot, NULL); if (IS_ERR_VALUE(addr)) pr_warn("add group remap k2u failed %ld\n", addr);
@@ -2586,7 +2603,7 @@ static unsigned long __sp_remap_get_pfn(unsigned long kva)
/* when called by k2u to group, always make sure rw_lock of spg is down */ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, - struct mm_struct *mm, unsigned long prot) + struct mm_struct *mm, unsigned long prot, struct sp_k2u_context *kc) { struct vm_area_struct *vma; unsigned long ret_addr; @@ -2598,6 +2615,8 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, if (unlikely(mm->core_state)) { pr_err("k2u mmap: encountered coredump, abort\n"); ret_addr = -EBUSY; + if (kc) + kc->state = K2U_COREDUMP; goto put_mm; }
@@ -2683,7 +2702,7 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un
spa->kva = kva;
- uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot); + uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot, NULL); __sp_area_drop(spa); if (IS_ERR(uva)) pr_err("remap k2u to task failed %ld\n", PTR_ERR(uva)); @@ -2711,6 +2730,8 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, unsigned long size, struct mm_struct *mm; struct sp_group_node *spg_node; void *uva = ERR_PTR(-ENODEV); + struct sp_k2u_context kc; + unsigned long ret_addr = -ENODEV;
down_read(&spg->rw_lock); spa = sp_alloc_area(size, sp_flags, spg, SPA_TYPE_K2SPG, current->tgid); @@ -2725,12 +2746,17 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, unsigned long size,
list_for_each_entry(spg_node, &spg->procs, proc_node) { mm = spg_node->master->mm; - uva = (void *)sp_remap_kva_to_vma(kva, spa, mm, spg_node->prot); - if (IS_ERR(uva)) { + kc.state = K2U_NORMAL; + ret_addr = sp_remap_kva_to_vma(kva, spa, mm, spg_node->prot, &kc); + if (IS_ERR_VALUE(ret_addr)) { + if (kc.state == K2U_COREDUMP) + continue; + uva = (void *)ret_addr; pr_err("remap k2u to spg failed %ld\n", PTR_ERR(uva)); __sp_free(spg, spa->va_start, spa_size(spa), mm); goto out; } + uva = (void *)ret_addr; }
out: @@ -2755,18 +2781,6 @@ static bool vmalloc_area_set_flag(unsigned long kva, unsigned long flags) return false; }
-struct sp_k2u_context { - unsigned long kva; - unsigned long kva_aligned; - unsigned long size; - unsigned long size_aligned; - unsigned long sp_flags; - int spg_id; - bool to_task; - struct timespec64 start; - struct timespec64 end; -}; - static void trace_sp_k2u_begin(struct sp_k2u_context *kc) { if (!sysctl_sp_perf_k2u)
From: Guo Mengqi guomengqi3@huawei.com
ascend inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4MUV2
CVE: NA
When sp_alloc is executed on the whole sharepool group and one process is coredumping, the allocation will skip the coredumped process and continue on the remaining processes in the group.
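As in the k2u patch, the per-process loop now treats a coredumping mm as skippable rather than fatal, with the exception of the pass-through (spg_none) case where there is only one target process. A condensed sketch of the reworked loop in sp_alloc_mmap_populate(), names taken from the hunks below:

	list_for_each_entry(spg_node, &spa->spg->procs, proc_node) {
		mm = spg_node->master->mm;
		mmap_ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac);
		if (mmap_ret) {
			/* real failure: propagate it */
			if (ac->state != ALLOC_COREDUMP)
				return mmap_ret;
			/* single-target (spg_none) allocation cannot be skipped */
			if (ac->spg == spg_none) {
				sp_alloc_unmap(mm, spa, spg_node);
				pr_err("dvpp allocation failed due to coredump");
				return mmap_ret;
			}
			/* group allocation: skip the coredumping process */
			ac->state = ALLOC_NORMAL;
			continue;
		}
		ret = mmap_ret;
	}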
Signed-off-by: Guo Mengqi guomengqi3@huawei.com
Reviewed-by: Weilong Chen chenweilong@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Reviewed-by: Weilong Chen chenweilong@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 mm/share_pool.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index f18bcd188027..b209e216b33a 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2144,6 +2144,7 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, #define ALLOC_NORMAL 1 #define ALLOC_RETRY 2 #define ALLOC_NOMEM 3 +#define ALLOC_COREDUMP 4
struct sp_alloc_context { struct sp_group *spg; @@ -2321,8 +2322,7 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, down_write(&mm->mmap_lock); if (unlikely(mm->core_state)) { up_write(&mm->mmap_lock); - sp_alloc_unmap(mm, spa, spg_node); - ac->state = ALLOC_NOMEM; + ac->state = ALLOC_COREDUMP; pr_info("allocation encountered coredump\n"); return -EFAULT; } @@ -2464,7 +2464,8 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, static int sp_alloc_mmap_populate(struct sp_area *spa, struct sp_alloc_context *ac) { - int ret; + int ret = -EINVAL; + int mmap_ret = 0; struct mm_struct *mm; struct sp_group_node *spg_node;
@@ -2474,9 +2475,19 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, /* create mapping for each process in the group */ list_for_each_entry(spg_node, &spa->spg->procs, proc_node) { mm = spg_node->master->mm; - ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac); - if (ret) - return ret; + mmap_ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac); + if (mmap_ret) { + if (ac->state != ALLOC_COREDUMP) + return mmap_ret; + if (ac->spg == spg_none) { + sp_alloc_unmap(mm, spa, spg_node); + pr_err("dvpp allocation failed due to coredump"); + return mmap_ret; + } + ac->state = ALLOC_NORMAL; + continue; + } + ret = mmap_ret; } } return ret;
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4M23J
CVE: NA
-------------------------------------------------
Don't use the spa's va_start if the spa is not valid.
Signed-off-by: Ding Tianhong dingtianhong@huawei.com
Reviewed-by: Weilong Chen chenweilong@huawei.com
Reviewed-by: Hanjun Guo guohanjun@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Reviewed-by: Weilong Chen chenweilong@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 mm/share_pool.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index b209e216b33a..3a37418378f6 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2508,13 +2508,14 @@ static void sp_alloc_finish(int result, struct sp_area *spa, sp_update_process_stat(current, true, spa);
/* this will free spa if mmap failed */ - if (spa && !IS_ERR(spa)) + if (spa && !IS_ERR(spa)) { __sp_area_drop(spa); + trace_sp_alloc_finish(ac, spa->va_start); + }
if (!is_pass_through) sp_group_drop(spg);
- trace_sp_alloc_finish(ac, spa->va_start); sp_dump_stack(); sp_try_to_compact(); }
From: Liu Shixin liushixin2@huawei.com
hulk inclusion
category: feature
bugzilla: 186182, https://gitee.com/openeuler/kernel/issues/I4UOJI
CVE: NA
--------------------------------
Support printing the rootfs files and tmpfs files that have pages charged in a given memory cgroup. The file information can be printed through the interface "memory.memfs_files_info" or printed when OOM is triggered.
To avoid flooding the kernel log, we limit the maximum number of files that can be printed on OOM through the interface "max_print_files_in_oom". And to filter out small files, we limit the minimum size of files that can be printed through the interface "size_threshold".
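How the two thresholds interact can be read directly from memfs_show_file_in_mem_cgroup() in the patch; a condensed excerpt (the OOM path is the one without a seq_file, i.e. pfc->m == NULL):

	size = memfs_count_in_mem_cgroup(pfc->memcg, inode->i_mapping);
	if (!size || size < pfc->size_threshold)
		return;				/* filtered out by size_threshold */

	pfc->total_print_files++;
	pfc->total_files_size += size;

	/* OOM path only: stop printing once max_print_files_in_oom is exceeded */
	if (!pfc->m && pfc->total_print_files > pfc->max_print_files)
		return;

	SEQ_printf(pfc->m, "%lukB %llukB %s\n",
		   size >> 10, inode->i_size >> 10, filepath);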
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 Documentation/vm/memcg_memfs_info.rst |  40 ++++
 include/linux/memcg_memfs_info.h      |  21 ++
 init/Kconfig                          |  10 +
 mm/Makefile                           |   1 +
 mm/memcg_memfs_info.c                 | 316 ++++++++++++++++++++++++++
 mm/memcontrol.c                       |  11 +
 6 files changed, 399 insertions(+)
 create mode 100644 Documentation/vm/memcg_memfs_info.rst
 create mode 100644 include/linux/memcg_memfs_info.h
 create mode 100644 mm/memcg_memfs_info.c
diff --git a/Documentation/vm/memcg_memfs_info.rst b/Documentation/vm/memcg_memfs_info.rst new file mode 100644 index 000000000000..aff432d125e5 --- /dev/null +++ b/Documentation/vm/memcg_memfs_info.rst @@ -0,0 +1,40 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +================ +Memcg Memfs Info +================ + +Overview +======== + +Support to print rootfs files and tmpfs files that having pages charged +in given memory cgroup. The files infomations can be printed through +interface "memory.memfs_files_info" or printed when OOM is triggered. + +User control +============ + +1. /sys/kernel/mm/memcg_memfs_info/enable +----------------------------------------- + +Boolean type. The default value is 0, set it to 1 to enable the feature. + +2. /sys/kernel/mm/memcg_memfs_info/max_print_files_in_oom +--------------------------------------------------------- + +Unsigned long type. The default value is 500, indicating that the maximum of +files can be print to console when OOM is triggered. + +3. /sys/kernel/mm/memcg_memfs_info/size_threshold +------------------------------------------------- + +Unsigned long type. The default value is 0, indicating that the minimum size of +files that can be printed. + +4. /sys/fs/cgroup/memory/<memory>/memory.memfs_files_info +--------------------------------------------------------- + +Outputs the files who use memory in this memory cgroup. + +--- +Liu Shixin, Jan 2022 diff --git a/include/linux/memcg_memfs_info.h b/include/linux/memcg_memfs_info.h new file mode 100644 index 000000000000..658a91e22bd7 --- /dev/null +++ b/include/linux/memcg_memfs_info.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_MEMCG_MEMFS_INFO_H +#define _LINUX_MEMCG_MEMFS_INFO_H + +#include <linux/memcontrol.h> +#include <linux/seq_file.h> + +#ifdef CONFIG_MEMCG_MEMFS_INFO +void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, struct seq_file *m); +int mem_cgroup_memfs_files_show(struct seq_file *m, void *v); +void mem_cgroup_memfs_info_init(void); +#else +static inline void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, + struct seq_file *m) +{ +} +static inline void mem_cgroup_memfs_info_init(void) +{ +} +#endif +#endif diff --git a/init/Kconfig b/init/Kconfig index 17533f1f19d4..895e0ef85f73 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -913,6 +913,16 @@ config MEMCG_KMEM depends on MEMCG && !SLOB default y
+config MEMCG_MEMFS_INFO + bool "Show memfs files that have pages charged in given memory cgroup" + depends on MEMCG + default n + help + Support to print rootfs files and tmpfs files that having pages + charged in given memory cgroup. The files infomations can be printed + through interface "memory.memfs_files_info" or printed when OOM is + triggered. + config BLK_CGROUP bool "IO controller" depends on BLOCK diff --git a/mm/Makefile b/mm/Makefile index 4b3a827429f3..d2a6a786f915 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -129,3 +129,4 @@ obj-$(CONFIG_PIN_MEMORY) += pin_mem.o obj-$(CONFIG_SHRINK_PAGECACHE) += page_cache_limit.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o +obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o diff --git a/mm/memcg_memfs_info.c b/mm/memcg_memfs_info.c new file mode 100644 index 000000000000..f404367ad08c --- /dev/null +++ b/mm/memcg_memfs_info.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include <linux/memcg_memfs_info.h> +#include <linux/fs.h> +#include <linux/sysfs.h> +#include <linux/kobject.h> +#include <linux/slab.h> +#include "../fs/mount.h" + +#define SEQ_printf(m, x...) \ +do { \ + if (m) \ + seq_printf(m, x); \ + else \ + pr_info(x); \ +} while (0) + +struct print_files_control { + struct mem_cgroup *memcg; + struct seq_file *m; + unsigned long size_threshold; + unsigned long max_print_files; + + char *pathbuf; + unsigned long pathbuf_size; + + const char *fs_type_name; + struct vfsmount *vfsmnt; + unsigned long total_print_files; + unsigned long total_files_size; +}; + +static bool memfs_enable; +static unsigned long memfs_size_threshold; +static unsigned long memfs_max_print_files = 500; + +static const char *const fs_type_names[] = { + "rootfs", + "tmpfs", +}; + +static struct vfsmount *memfs_get_vfsmount(struct super_block *sb) +{ + struct mount *mnt; + struct vfsmount *vfsmnt; + + lock_mount_hash(); + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + /* + * There may be multiple mount points for a super_block, + * just need to print one of these mount points to determine + * the file path. + */ + vfsmnt = mntget(&mnt->mnt); + unlock_mount_hash(); + return vfsmnt; + } + unlock_mount_hash(); + + return NULL; +} + +static unsigned long memfs_count_in_mem_cgroup(struct mem_cgroup *memcg, + struct address_space *mapping) +{ + XA_STATE(xas, &mapping->i_pages, 0); + unsigned long size = 0; + struct page *page, *head; + + rcu_read_lock(); + xas_for_each(&xas, page, ULONG_MAX) { + if (xas_retry(&xas, page)) + continue; + + if (xa_is_value(page)) + continue; + + head = compound_head(page); + if ((unsigned long)memcg == head->memcg_data) + size += PAGE_SIZE; + } + rcu_read_unlock(); + return size; +} + +static void memfs_show_file_in_mem_cgroup(void *data, struct inode *inode) +{ + struct print_files_control *pfc = data; + struct dentry *dentry; + unsigned long size; + struct path path; + char *filepath; + + size = memfs_count_in_mem_cgroup(pfc->memcg, inode->i_mapping); + if (!size || size < pfc->size_threshold) + return; + + dentry = d_find_alias(inode); + if (!dentry) + return; + path.mnt = pfc->vfsmnt; + path.dentry = dentry; + filepath = d_absolute_path(&path, pfc->pathbuf, pfc->pathbuf_size); + if (!filepath || IS_ERR(filepath)) + filepath = "(too long)"; + pfc->total_print_files++; + pfc->total_files_size += size; + dput(dentry); + + /* + * To prevent excessive logs, limit the amount of data + * that can be output to logs. 
+ */ + if (!pfc->m && pfc->total_print_files > pfc->max_print_files) + return; + + SEQ_printf(pfc->m, "%lukB %llukB %s\n", + size >> 10, inode->i_size >> 10, filepath); +} + +static void memfs_show_files_in_mem_cgroup(struct super_block *sb, void *data) +{ + struct print_files_control *pfc = data; + struct inode *inode, *toput_inode = NULL; + + if (strncmp(sb->s_type->name, + pfc->fs_type_name, strlen(pfc->fs_type_name))) + return; + + pfc->vfsmnt = memfs_get_vfsmount(sb); + if (!pfc->vfsmnt) + return; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0 && !need_resched())) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + + memfs_show_file_in_mem_cgroup(pfc, inode); + + iput(toput_inode); + toput_inode = inode; + + cond_resched(); + spin_lock(&sb->s_inode_list_lock); + } + spin_unlock(&sb->s_inode_list_lock); + iput(toput_inode); + mntput(pfc->vfsmnt); +} + +void mem_cgroup_print_memfs_info(struct mem_cgroup *memcg, struct seq_file *m) +{ + struct print_files_control pfc = { + .memcg = memcg, + .m = m, + .max_print_files = memfs_max_print_files, + .size_threshold = memfs_size_threshold, + }; + char *pathbuf; + int i; + + if (!memfs_enable || !memcg) + return; + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathbuf) { + SEQ_printf(m, "Show memfs failed due to OOM\n"); + return; + } + pfc.pathbuf = pathbuf; + pfc.pathbuf_size = PATH_MAX; + + for (i = 0; i < ARRAY_SIZE(fs_type_names); i++) { + pfc.fs_type_name = fs_type_names[i]; + pfc.total_print_files = 0; + pfc.total_files_size = 0; + + SEQ_printf(m, "Show %s files (memory-size > %lukB):\n", + pfc.fs_type_name, pfc.size_threshold >> 10); + SEQ_printf(m, "<memory-size> <file-size> <path>\n"); + iterate_supers(memfs_show_files_in_mem_cgroup, &pfc); + + SEQ_printf(m, "total files: %lu, total memory-size: %lukB\n", + pfc.total_print_files, pfc.total_files_size >> 10); + } + + kfree(pfc.pathbuf); +} + +int mem_cgroup_memfs_files_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + mem_cgroup_print_memfs_info(memcg, m); + return 0; +} + +static ssize_t memfs_size_threshold_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", memfs_size_threshold); +} + +static ssize_t memfs_size_threshold_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + unsigned long count; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + memfs_size_threshold = count; + return len; +} + +static struct kobj_attribute memfs_size_threshold_attr = { + .attr = {"size_threshold", 0644}, + .show = &memfs_size_threshold_show, + .store = &memfs_size_threshold_store, +}; + +static ssize_t memfs_max_print_files_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", memfs_max_print_files); +} + +static ssize_t memfs_max_print_files_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + unsigned long count; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + memfs_max_print_files = count; + return len; +} + +static struct kobj_attribute memfs_max_print_files_attr = { + .attr = {"max_print_files_in_oom", 0644}, + .show = &memfs_max_print_files_show, + 
.store = &memfs_max_print_files_store, +}; + +static ssize_t memfs_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", memfs_enable); +} + +static ssize_t memfs_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + bool enable; + int err; + + err = kstrtobool(buf, &enable); + if (err) + return err; + + memfs_enable = enable; + return len; +} + +static struct kobj_attribute memfs_enable_attr = { + .attr = {"enable", 0644}, + .show = &memfs_enable_show, + .store = &memfs_enable_store, +}; + +static struct attribute *memfs_attr[] = { + &memfs_size_threshold_attr.attr, + &memfs_max_print_files_attr.attr, + &memfs_enable_attr.attr, + NULL, +}; + +static struct attribute_group memfs_attr_group = { + .attrs = memfs_attr, +}; + +void mem_cgroup_memfs_info_init(void) +{ + struct kobject *memcg_memfs_kobj; + + if (mem_cgroup_disabled()) + return; + + memcg_memfs_kobj = kobject_create_and_add("memcg_memfs_info", mm_kobj); + if (unlikely(!memcg_memfs_kobj)) { + pr_err("failed to create memcg_memfs kobject\n"); + return; + } + + if (sysfs_create_group(memcg_memfs_kobj, &memfs_attr_group)) { + pr_err("failed to register memcg_memfs group\n"); + kobject_put(memcg_memfs_kobj); + } +} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fad3d4dd88ec..daed900a666e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -62,6 +62,7 @@ #include <linux/tracehook.h> #include <linux/psi.h> #include <linux/seq_buf.h> +#include <linux/memcg_memfs_info.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -1625,6 +1626,8 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) return; pr_info("%s", buf); kfree(buf); + + mem_cgroup_print_memfs_info(memcg, NULL); }
/* @@ -5219,6 +5222,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, }, #endif +#ifdef CONFIG_MEMCG_MEMFS_INFO + { + .name = "memfs_files_info", + .seq_show = mem_cgroup_memfs_files_show, + }, +#endif #ifdef CONFIG_NUMA { .name = "numa_stat", @@ -7358,6 +7367,8 @@ static int __init mem_cgroup_init(void) soft_limit_tree.rb_tree_per_node[node] = rtpn; }
+ mem_cgroup_memfs_info_init(); + return 0; } subsys_initcall(mem_cgroup_init);
From: Liu Shixin liushixin2@huawei.com
hulk inclusion
category: feature
bugzilla: 186182, https://gitee.com/openeuler/kernel/issues/I4UOJI
CVE: NA
--------------------------------
Enable CONFIG_MEMCG_MEMFS_INFO by default.
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 arch/arm64/configs/openeuler_defconfig | 1 +
 arch/x86/configs/openeuler_defconfig   | 1 +
 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 5020f94eea34..770222a597e4 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -133,6 +133,7 @@ CONFIG_PAGE_COUNTER=y CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y CONFIG_MEMCG_KMEM=y +CONFIG_MEMCG_MEMFS_INFO=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 351a89b54d7f..926dfe0628dc 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -152,6 +152,7 @@ CONFIG_PAGE_COUNTER=y CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y CONFIG_MEMCG_KMEM=y +CONFIG_MEMCG_MEMFS_INFO=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y
From: Piotr Figiel figiel@google.com
mainline inclusion
from mainline-5.13-rc1
commit 90f093fa8ea48e5d991332cee160b761423d55c1
category: feature
feature: Userspace percpu
bugzilla: https://gitee.com/openeuler/kernel/issues/I4W2BQ
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
For userspace checkpoint and restore (C/R) a way of getting process state containing RSEQ configuration is needed.
There are two ways this information is going to be used: - to re-enable RSEQ for threads which had it enabled before C/R - to detect if a thread was in a critical section during C/R
Since C/R preserves TLS memory and addresses, the RSEQ ABI will be restored using the address registered before C/R.
Detection whether the thread is in a critical section during C/R is needed to enforce behavior of RSEQ abort during C/R. Attaching with ptrace() before registers are dumped itself doesn't cause RSEQ abort. Restoring the instruction pointer within the critical section is problematic because rseq_cs may get cleared before the control is passed to the migrated application code leading to RSEQ invariants not being preserved. C/R code will use RSEQ ABI address to find the abort handler to which the instruction pointer needs to be set.
To achieve the above goals, expose the RSEQ ABI address and the signature value with the new ptrace request PTRACE_GET_RSEQ_CONFIGURATION.
This new ptrace request can also be used by debuggers so they are aware of stops within restartable sequences in progress.
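For illustration, a minimal user-space sketch of how a C/R tool or debugger might query the new request. The request number and struct layout are taken from the patch below; the local struct name, the dump_rseq() helper and the attach/wait boilerplate are illustrative assumptions, not part of the patch:

	#include <stdio.h>
	#include <stdint.h>
	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <sys/wait.h>

	#ifndef PTRACE_GET_RSEQ_CONFIGURATION
	#define PTRACE_GET_RSEQ_CONFIGURATION	0x420f
	#endif

	/* local copy of the uapi layout added by the patch */
	struct rseq_config {
		uint64_t rseq_abi_pointer;
		uint32_t rseq_abi_size;
		uint32_t signature;
		uint32_t flags;
		uint32_t pad;
	};

	static int dump_rseq(pid_t pid)
	{
		struct rseq_config conf;
		long ret;

		if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1)
			return -1;
		waitpid(pid, NULL, 0);		/* wait for the ptrace stop */

		/* addr carries the buffer size, data the buffer itself */
		ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid,
			     (void *)sizeof(conf), &conf);
		if (ret > 0)
			printf("rseq abi @ 0x%llx size %u signature 0x%x\n",
			       (unsigned long long)conf.rseq_abi_pointer,
			       conf.rseq_abi_size, conf.signature);

		ptrace(PTRACE_DETACH, pid, NULL, NULL);
		return ret > 0 ? 0 : -1;
	}

A zero rseq_abi_pointer in the returned struct indicates the thread has no RSEQ registration.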
Signed-off-by: Piotr Figiel figiel@google.com
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org
Signed-off-by: Thomas Gleixner tglx@linutronix.de
Reviewed-by: Michal Miroslaw emmir@google.com
Reviewed-by: Mathieu Desnoyers mathieu.desnoyers@efficios.com
Acked-by: Oleg Nesterov oleg@redhat.com
Link: https://lkml.kernel.org/r/20210226135156.1081606-1-figiel@google.com
Signed-off-by: Yunfeng Ye yeyunfeng@huawei.com
Reviewed-by: Chao Liu liuchao173@huawei.com
Reviewed-by: Kuohai Xu xukuohai@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 include/uapi/linux/ptrace.h | 10 ++++++++++
 kernel/ptrace.c             | 25 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+)
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index 83ee45fa634b..3747bf816f9a 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -102,6 +102,16 @@ struct ptrace_syscall_info { }; };
+#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f + +struct ptrace_rseq_configuration { + __u64 rseq_abi_pointer; + __u32 rseq_abi_size; + __u32 signature; + __u32 flags; + __u32 pad; +}; + /* * These values are stored in task->ptrace_message * by tracehook_report_syscall_* to describe the current syscall-stop. diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0087ce50d99e..e3210358bcd2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -31,6 +31,7 @@ #include <linux/cn_proc.h> #include <linux/compat.h> #include <linux/sched/signal.h> +#include <linux/minmax.h>
#include <asm/syscall.h> /* for syscall_get_* */
@@ -795,6 +796,24 @@ static int ptrace_peek_siginfo(struct task_struct *child, return ret; }
+#ifdef CONFIG_RSEQ +static long ptrace_get_rseq_configuration(struct task_struct *task, + unsigned long size, void __user *data) +{ + struct ptrace_rseq_configuration conf = { + .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, + .rseq_abi_size = sizeof(*task->rseq), + .signature = task->rseq_sig, + .flags = 0, + }; + + size = min_t(unsigned long, size, sizeof(conf)); + if (copy_to_user(data, &conf, size)) + return -EFAULT; + return sizeof(conf); +} +#endif + #ifdef PTRACE_SINGLESTEP #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) #else @@ -1243,6 +1262,12 @@ int ptrace_request(struct task_struct *child, long request, ret = seccomp_get_metadata(child, addr, datavp); break;
+#ifdef CONFIG_RSEQ + case PTRACE_GET_RSEQ_CONFIGURATION: + ret = ptrace_get_rseq_configuration(child, addr, datavp); + break; +#endif + default: break; }
From: Baokun Li libaokun1@huawei.com
hulk inclusion
category: bugfix
bugzilla: 185988, https://gitee.com/openeuler/kernel/issues/I4YVV3
--------------------------------
In jffs2_scan_medium(), if `s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);` fails, the code jumps to the "out" label to clean up. A null pointer dereference then occurs when `if (s->sum_list_head)` is executed under "out".
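The fix splits the error path so the summary cleanup is only reached when 's' was actually allocated. Roughly, the resulting flow in jffs2_scan_medium() looks like this (condensed from the diff below):

	s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
	if (!s) {
		JFFS2_WARNING("Can't allocate memory for summary\n");
		ret = -ENOMEM;
		goto out_buf;			/* s is NULL, skip the summary cleanup */
	}
	...
 out:
	jffs2_sum_reset_collected(s);		/* only reached with a valid s */
	kfree(s);
 out_buf:
	if (buf_size)
		kfree(flashbuf);
	return ret;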
Fixes: bf7ba557361f ("[Huawei] jffs2: fix memory leak in jffs2_scan_medium")
Signed-off-by: Baokun Li libaokun1@huawei.com
Reviewed-by: Zhang Yi yi.zhang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/jffs2/scan.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 49b0637fb36e..29671e33a171 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -136,7 +136,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) if (!s) { JFFS2_WARNING("Can't allocate memory for summary\n"); ret = -ENOMEM; - goto out; + goto out_buf; } }
@@ -275,15 +275,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) } ret = 0; out: + jffs2_sum_reset_collected(s); + kfree(s); + out_buf: if (buf_size) kfree(flashbuf); #ifndef __ECOS else mtd_unpoint(c->mtd, 0, c->mtd->size); #endif - if (s->sum_list_head) - jffs2_sum_reset_collected(s); - kfree(s); return ret; }
From: Kefeng Wang wangkefeng.wang@huawei.com
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
------------------------------
It's better to move the non-upstreamed feature into a stand-alone file, which makes it easier to backport mainline patches.
No functional changes.
Cc: Sang Yan sangyan@huawei.com
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Reviewed-by: Sang Yan sangyan@huawei.com
Reviewed-by: Liu Shixin liushixin2@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 arch/arm64/kernel/setup.c                    | 10 +---
 arch/arm64/mm/Makefile                       |  1 +
 arch/arm64/mm/init.c                         | 43 +---------------
 arch/arm64/mm/{pmem_reserve.h => internal.h} | 12 +++++
 arch/arm64/mm/quick_kexec.c                  | 53 ++++++++++++++++++++
 5 files changed, 69 insertions(+), 50 deletions(-)
 rename arch/arm64/mm/{pmem_reserve.h => internal.h} (52%)
 create mode 100644 arch/arm64/mm/quick_kexec.c
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 08198b824846..2dd3ea837d35 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -53,7 +53,7 @@ #include <asm/xen/hypervisor.h> #include <asm/mmu_context.h>
-#include "../mm/pmem_reserve.h" +#include "../mm/internal.h"
static int num_standard_resources; static struct resource *standard_resources; @@ -291,13 +291,7 @@ static void __init request_standard_resources(void) request_resource(res, &crashk_res); #endif
-#ifdef CONFIG_QUICK_KEXEC - if (quick_kexec_res.end && - quick_kexec_res.start >= res->start && - quick_kexec_res.end <= res->end) - request_resource(res, &quick_kexec_res); -#endif - + request_quick_kexec_res(res); request_pin_mem_res(res); }
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 3634ad81bdf1..68a32305cff9 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -17,3 +17,4 @@ obj-$(CONFIG_KASAN) += kasan_init.o KASAN_SANITIZE_kasan_init.o := n
obj-$(CONFIG_ARM64_PMEM_RESERVE) += pmem_reserve.o +obj-$(CONFIG_QUICK_KEXEC) += quick_kexec.o diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 5ab9dd7d55d9..90411356b8b2 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -45,7 +45,7 @@ #include <asm/tlb.h> #include <asm/alternative.h>
-#include "pmem_reserve.h" +#include "internal.h"
/* * We need to be able to catch inadvertent references to memstart_addr @@ -131,45 +131,6 @@ static void __init reserve_elfcorehdr(void) } #endif /* CONFIG_CRASH_DUMP */
-#ifdef CONFIG_QUICK_KEXEC -static int __init parse_quick_kexec(char *p) -{ - if (!p) - return 0; - - quick_kexec_res.end = PAGE_ALIGN(memparse(p, NULL)); - - return 0; -} -early_param("quickkexec", parse_quick_kexec); - -static void __init reserve_quick_kexec(void) -{ - unsigned long long mem_start, mem_len; - - mem_len = quick_kexec_res.end; - if (mem_len == 0) - return; - - /* Current arm64 boot protocol requires 2MB alignment */ - mem_start = memblock_find_in_range(0, arm64_dma_phys_limit, - mem_len, SZ_2M); - if (mem_start == 0) { - pr_warn("cannot allocate quick kexec mem (size:0x%llx)\n", - mem_len); - quick_kexec_res.end = 0; - return; - } - - memblock_reserve(mem_start, mem_len); - pr_info("quick kexec mem reserved: 0x%016llx - 0x%016llx (%lld MB)\n", - mem_start, mem_start + mem_len, mem_len >> 20); - - quick_kexec_res.start = mem_start; - quick_kexec_res.end = mem_start + mem_len - 1; -} -#endif - /* * Return the maximum physical address for a zone accessible by the given bits * limit. If DRAM starts above 32-bit, expand the zone to the maximum @@ -591,9 +552,7 @@ void __init bootmem_init(void) */ reserve_crashkernel();
-#ifdef CONFIG_QUICK_KEXEC reserve_quick_kexec(); -#endif
reserve_pmem();
diff --git a/arch/arm64/mm/pmem_reserve.h b/arch/arm64/mm/internal.h similarity index 52% rename from arch/arm64/mm/pmem_reserve.h rename to arch/arm64/mm/internal.h index d143198c9696..e1c6fc36b3b5 100644 --- a/arch/arm64/mm/pmem_reserve.h +++ b/arch/arm64/mm/internal.h @@ -1,5 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARM64_MM_INTERNAL_H +#define __ARM64_MM_INTERNAL_H + #include <linux/types.h>
#ifdef CONFIG_ARM64_PMEM_RESERVE @@ -11,3 +14,12 @@ static inline void __init setup_reserve_pmem(u64 start, u64 size) {} static inline void __init reserve_pmem(void) {} static inline void __init request_pmem_res_resource(void) {} #endif +#ifdef CONFIG_ARM64_PMEM_RESERVE +void __init reserve_quick_kexec(void); +void __init request_quick_kexec_res(struct resource *res); +#else +static inline void __init reserve_quick_kexec(void) {} +static inline void __init request_quick_kexec_res(struct resource *res) {} +#endif + +#endif /* ifndef _ARM64_MM_INTERNAL_H */ diff --git a/arch/arm64/mm/quick_kexec.c b/arch/arm64/mm/quick_kexec.c new file mode 100644 index 000000000000..fb68346f45a9 --- /dev/null +++ b/arch/arm64/mm/quick_kexec.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "quick_kexec: " fmt + +#include <linux/memblock.h> +#include <linux/ioport.h> +#include <linux/types.h> +#include <linux/kexec.h> + +static int __init parse_quick_kexec(char *p) +{ + if (!p) + return 0; + + quick_kexec_res.end = PAGE_ALIGN(memparse(p, NULL)); + + return 0; +} +early_param("quickkexec", parse_quick_kexec); + +void __init reserve_quick_kexec(void) +{ + unsigned long long mem_start, mem_len; + + mem_len = quick_kexec_res.end; + if (mem_len == 0) + return; + + /* Current arm64 boot protocol requires 2MB alignment */ + mem_start = memblock_find_in_range(0, arm64_dma_phys_limit, + mem_len, SZ_2M); + if (mem_start == 0) { + pr_warn("cannot allocate quick kexec mem (size:0x%llx)\n", + mem_len); + quick_kexec_res.end = 0; + return; + } + + memblock_reserve(mem_start, mem_len); + pr_info("quick kexec mem reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + mem_start, mem_start + mem_len, mem_len >> 20); + + quick_kexec_res.start = mem_start; + quick_kexec_res.end = mem_start + mem_len - 1; +} + +void __init request_quick_kexec_res(struct resource *res) +{ + if (quick_kexec_res.end && + quick_kexec_res.start >= res->start && + quick_kexec_res.end <= res->end) + request_resource(res, &quick_kexec_res); +}
From: Kefeng Wang wangkefeng.wang@huawei.com
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
------------------------------
It's better to move the non-upstreamed feature into a stand-alone file, which makes it easier to backport mainline patches.
No functional changes.
Cc: Sang Yan sangyan@huawei.com
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Reviewed-by: Sang Yan sangyan@huawei.com
Reviewed-by: Liu Shixin liushixin2@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 arch/arm64/include/asm/cpu_park.h  |  29 +++
 arch/arm64/include/asm/kexec.h     |   5 -
 arch/arm64/include/asm/smp.h       |  16 +-
 arch/arm64/kernel/Makefile         |   2 +-
 arch/arm64/kernel/arm64_cpu_park.c | 289 +++++++++++++++++++++++++
 arch/arm64/kernel/cpu-park.S       |   2 +-
 arch/arm64/kernel/process.c        |   3 +-
 arch/arm64/kernel/smp.c            | 226 +---------------------
 arch/arm64/mm/init.c               |  54 +-----
 9 files changed, 328 insertions(+), 298 deletions(-)
 create mode 100644 arch/arm64/include/asm/cpu_park.h
 create mode 100644 arch/arm64/kernel/arm64_cpu_park.c
diff --git a/arch/arm64/include/asm/cpu_park.h b/arch/arm64/include/asm/cpu_park.h new file mode 100644 index 000000000000..0aa4ebf6f830 --- /dev/null +++ b/arch/arm64/include/asm/cpu_park.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_CPU_PARK_H +#define __ASM_CPU_PARK_H + +#ifdef CONFIG_ARM64_CPU_PARK + +/* CPU park state flag: "park" */ +#define PARK_MAGIC 0x7061726b + +#ifndef __ASSEMBLY__ +extern void enter_cpu_park(unsigned long text, unsigned long exit); +extern void do_cpu_park(unsigned long exit); +extern void reserve_park_mem(void); +extern int write_park_exit(unsigned int cpu); +extern int uninstall_cpu_park(unsigned int cpu); +extern void cpu_park_stop(void); +extern int kexec_smp_send_park(void); +#endif /* ifndef __ASSEMBLY__ */ + +#else +static inline void reserve_park_mem(void) {} +static inline int write_park_exit(unsigned int cpu) { return -EINVAL; } +static inline int uninstall_cpu_park(unsigned int cpu) { return -EINVAL; } +static inline void cpu_park_stop(void) {} +static inline int kexec_smp_send_park(void) { return -EINVAL; } +#endif + +#endif /* ifndef __ASM_CPU_PARK_H */ diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 20bee23b6503..e19c0af3b53d 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -32,11 +32,6 @@
#define CRASH_ADDR_HIGH_MAX MEMBLOCK_ALLOC_ACCESSIBLE
-#ifdef CONFIG_ARM64_CPU_PARK -/* CPU park state flag: "park" */ -#define PARK_MAGIC 0x7061726b -#endif - #ifndef __ASSEMBLY__
/** diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 8c5d2d650b8a..f4b19b8f323a 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -145,21 +145,7 @@ bool cpus_are_stuck_in_kernel(void);
extern void crash_smp_send_stop(void); extern bool smp_crash_stop_failed(void); -#ifdef CONFIG_ARM64_CPU_PARK -#define PARK_SECTION_SIZE 1024 -struct cpu_park_info { - /* Physical address of reserved park memory. */ - unsigned long start; - /* park reserve mem len should be PARK_SECTION_SIZE * NR_CPUS */ - unsigned long len; - /* Virtual address of reserved park memory. */ - unsigned long start_v; -}; -extern struct cpu_park_info park_info; -extern void enter_cpu_park(unsigned long text, unsigned long exit); -extern void do_cpu_park(unsigned long exit); -extern int kexec_smp_send_park(void); -#endif +extern void smp_cross_send_stop(cpumask_t *cpumask);
#endif /* ifndef __ASSEMBLY__ */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 169d90f11cf5..4cf75b247461 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -56,7 +56,7 @@ obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ cpu-reset.o -obj-$(CONFIG_ARM64_CPU_PARK) += cpu-park.o +obj-$(CONFIG_ARM64_CPU_PARK) += cpu-park.o arm64_cpu_park.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o diff --git a/arch/arm64/kernel/arm64_cpu_park.c b/arch/arm64/kernel/arm64_cpu_park.c new file mode 100644 index 000000000000..c54ffa26a2c7 --- /dev/null +++ b/arch/arm64/kernel/arm64_cpu_park.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "arm64 cpu-park: " fmt + +#include <linux/arm_sdei.h> +#include <linux/delay.h> +#include <linux/kexec.h> +#include <linux/memblock.h> +#include <linux/ioport.h> +#include <linux/io.h> +#include <linux/types.h> + +#include <asm/cacheflush.h> +#include <asm/cpu_ops.h> +#include <asm/cpu_park.h> +#include <asm/mmu_context.h> + +#define PARK_SECTION_SIZE 1024 + +struct cpu_park_section { + unsigned long exit; /* exit address of park look */ + unsigned long magic; /* maigc represent park state */ + char text[0]; /* text section of park */ +}; + +struct cpu_park_info { + /* Physical address of reserved park memory. */ + unsigned long start; + /* park reserve mem len should be PARK_SECTION_SIZE * NR_CPUS */ + unsigned long len; + /* Virtual address of reserved park memory. */ + unsigned long start_v; +}; + +static struct cpu_park_info park_info = { + .start = 0, + .len = PARK_SECTION_SIZE * NR_CPUS, + .start_v = 0, +}; + +static int __init parse_park_mem(char *p) +{ + if (!p) + return 0; + + park_info.start = PAGE_ALIGN(memparse(p, NULL)); + if (park_info.start == 0) + pr_info("cpu park mem params[%s]", p); + + return 0; +} +early_param("cpuparkmem", parse_park_mem); + +void __init reserve_park_mem(void) +{ + if (park_info.start == 0 || park_info.len == 0) + return; + + park_info.start = PAGE_ALIGN(park_info.start); + park_info.len = PAGE_ALIGN(park_info.len); + + if (!memblock_is_region_memory(park_info.start, park_info.len)) { + pr_warn("region is not memory!"); + goto out; + } + + if (memblock_is_region_reserved(park_info.start, park_info.len)) { + pr_warn("region overlaps reserved memory!"); + goto out; + } + + memblock_remove(park_info.start, park_info.len); + pr_info("mem reserved: 0x%016lx - 0x%016lx (%ld MB)", + park_info.start, park_info.start + park_info.len, + park_info.len >> 20); + + return; +out: + park_info.start = 0; + park_info.len = 0; + return; +} + +static int mmap_cpu_park_mem(void) +{ + if (!park_info.start) + return -ENOMEM; + + if (park_info.start_v) + return 0; + + park_info.start_v = (unsigned long)__ioremap(park_info.start, + park_info.len, + PAGE_KERNEL_EXEC); + if (!park_info.start_v) { + pr_warn("map park memory failed."); + return -ENOMEM; + } + + return 0; +} + +static inline unsigned long cpu_park_section_v(unsigned int cpu) +{ + return park_info.start_v + PARK_SECTION_SIZE * (cpu - 1); +} + +static inline unsigned long cpu_park_section_p(unsigned int cpu) +{ + return park_info.start + PARK_SECTION_SIZE * (cpu - 1); +} + +/* + * Write the secondary_entry to exit section of park state. 
+ * Then the secondary cpu will jump straight into the kernel + * by the secondary_entry. + */ +int write_park_exit(unsigned int cpu) +{ + struct cpu_park_section *park_section; + unsigned long *park_exit; + unsigned long *park_text; + + if (mmap_cpu_park_mem() != 0) + return -EPERM; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + park_exit = &park_section->exit; + park_text = (unsigned long *)park_section->text; + pr_debug("park_text 0x%lx : 0x%lx, do_cpu_park text 0x%lx : 0x%lx", + (unsigned long)park_text, *park_text, + (unsigned long)do_cpu_park, + *(unsigned long *)do_cpu_park); + + /* + * Test first 8 bytes to determine + * whether needs to write cpu park exit. + */ + if (*park_text == *(unsigned long *)do_cpu_park) { + writeq_relaxed(__pa_symbol(secondary_entry), park_exit); + __flush_dcache_area((__force void *)park_exit, + sizeof(unsigned long)); + flush_icache_range((unsigned long)park_exit, + (unsigned long)(park_exit + 1)); + sev(); + dsb(sy); + isb(); + + pr_debug("Write cpu %u secondary entry 0x%lx to 0x%lx.", + cpu, *park_exit, (unsigned long)park_exit); + pr_info("Boot cpu %u from PARK state.", cpu); + return 0; + } + + return -EPERM; +} + +/* Install cpu park sections for the specific cpu. */ +static void install_cpu_park(unsigned int cpu) +{ + struct cpu_park_section *park_section; + unsigned long *park_exit; + unsigned long *park_magic; + unsigned long park_text_len; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + pr_debug("Install cpu park on cpu %u park exit 0x%lx park text 0x%lx", + cpu, (unsigned long)park_section, + (unsigned long)(park_section->text)); + + park_exit = &park_section->exit; + park_magic = &park_section->magic; + park_text_len = PARK_SECTION_SIZE - sizeof(struct cpu_park_section); + + *park_exit = 0UL; + *park_magic = 0UL; + memcpy((void *)park_section->text, do_cpu_park, park_text_len); + __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE); +} + +int uninstall_cpu_park(unsigned int cpu) +{ + unsigned long park_section; + + if (mmap_cpu_park_mem() != 0) + return -EPERM; + + park_section = cpu_park_section_v(cpu); + memset((void *)park_section, 0, PARK_SECTION_SIZE); + __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE); + + return 0; +} + +static int cpu_wait_park(unsigned int cpu) +{ + long timeout; + struct cpu_park_section *park_section; + + volatile unsigned long *park_magic; + + park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); + park_magic = &park_section->magic; + + timeout = USEC_PER_SEC; + while (*park_magic != PARK_MAGIC && timeout--) + udelay(1); + + if (timeout > 0) + pr_debug("cpu %u park done.", cpu); + else + pr_err("cpu %u park failed.", cpu); + + return *park_magic == PARK_MAGIC; +} + +static void cpu_park(unsigned int cpu) +{ + unsigned long park_section_p; + unsigned long park_exit_phy; + unsigned long do_park; + typeof(enter_cpu_park) *park; + + park_section_p = cpu_park_section_p(cpu); + park_exit_phy = park_section_p; + pr_debug("Go to park cpu %u exit address 0x%lx", cpu, park_exit_phy); + + do_park = park_section_p + sizeof(struct cpu_park_section); + park = (void *)__pa_symbol(enter_cpu_park); + + cpu_install_idmap(); + park(do_park, park_exit_phy); + unreachable(); +} + +void cpu_park_stop(void) +{ + int cpu = smp_processor_id(); + const struct cpu_operations *ops = NULL; + /* + * Go to cpu park state. + * Otherwise go to cpu die. 
+ */ + if (kexec_in_progress && park_info.start_v) { + machine_kexec_mask_interrupts(); + cpu_park(cpu); + + ops = get_cpu_ops(cpu); + if (ops && ops->cpu_die) + ops->cpu_die(cpu); + } +} + +int kexec_smp_send_park(void) +{ + unsigned long cpu; + + if (WARN_ON(!kexec_in_progress)) { + pr_crit("%s called not in kexec progress.", __func__); + return -EPERM; + } + + if (mmap_cpu_park_mem() != 0) { + pr_info("no cpuparkmem, goto normal way."); + return -EPERM; + } + + local_irq_disable(); + + if (num_online_cpus() > 1) { + cpumask_t mask; + + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + for_each_cpu(cpu, &mask) + install_cpu_park(cpu); + smp_cross_send_stop(&mask); + + /* Wait for other CPUs to park */ + for_each_cpu(cpu, &mask) + cpu_wait_park(cpu); + pr_info("smp park other cpus done\n"); + } + + sdei_mask_local_cpu(); + + return 0; +} diff --git a/arch/arm64/kernel/cpu-park.S b/arch/arm64/kernel/cpu-park.S index 07290dabe10c..8bcfcf0dc0f5 100644 --- a/arch/arm64/kernel/cpu-park.S +++ b/arch/arm64/kernel/cpu-park.S @@ -11,7 +11,7 @@
#include <linux/linkage.h> #include <asm/assembler.h> -#include <asm/kexec.h> +#include <asm/cpu_park.h> #include <asm/sysreg.h> #include <asm/virt.h>
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index da5ed561e962..d7c90d8e25d1 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -57,6 +57,7 @@ #include <asm/pointer_auth.h> #include <asm/stacktrace.h> #include <asm/mpam_sched.h> +#include <asm/cpu_park.h>
#if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include <linux/stackprotector.h> @@ -151,10 +152,8 @@ void arch_cpu_idle_dead(void) */ void machine_shutdown(void) { -#ifdef CONFIG_ARM64_CPU_PARK if (kexec_smp_send_park() == 0) return; -#endif smp_shutdown_nonboot_cpus(reboot_cpu); }
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index fc099cda70a3..dd4c76ed8ca6 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -55,6 +55,7 @@ #include <asm/tlbflush.h> #include <asm/ptrace.h> #include <asm/virt.h> +#include <asm/cpu_park.h>
#define CREATE_TRACE_POINTS #include <trace/events/ipi.h> @@ -98,167 +99,6 @@ static inline int op_cpu_kill(unsigned int cpu) } #endif
-#ifdef CONFIG_ARM64_CPU_PARK -struct cpu_park_section { - unsigned long exit; /* exit address of park look */ - unsigned long magic; /* maigc represent park state */ - char text[0]; /* text section of park */ -}; - -static int mmap_cpu_park_mem(void) -{ - if (!park_info.start) - return -ENOMEM; - - if (park_info.start_v) - return 0; - - park_info.start_v = (unsigned long)__ioremap(park_info.start, - park_info.len, - PAGE_KERNEL_EXEC); - if (!park_info.start_v) { - pr_warn("map park memory failed."); - return -ENOMEM; - } - - return 0; -} - -static inline unsigned long cpu_park_section_v(unsigned int cpu) -{ - return park_info.start_v + PARK_SECTION_SIZE * (cpu - 1); -} - -static inline unsigned long cpu_park_section_p(unsigned int cpu) -{ - return park_info.start + PARK_SECTION_SIZE * (cpu - 1); -} - -/* - * Write the secondary_entry to exit section of park state. - * Then the secondary cpu will jump straight into the kernel - * by the secondary_entry. - */ -static int write_park_exit(unsigned int cpu) -{ - struct cpu_park_section *park_section; - unsigned long *park_exit; - unsigned long *park_text; - - if (mmap_cpu_park_mem() != 0) - return -EPERM; - - park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); - park_exit = &park_section->exit; - park_text = (unsigned long *)park_section->text; - pr_debug("park_text 0x%lx : 0x%lx, do_cpu_park text 0x%lx : 0x%lx", - (unsigned long)park_text, *park_text, - (unsigned long)do_cpu_park, - *(unsigned long *)do_cpu_park); - - /* - * Test first 8 bytes to determine - * whether needs to write cpu park exit. - */ - if (*park_text == *(unsigned long *)do_cpu_park) { - writeq_relaxed(__pa_symbol(secondary_entry), park_exit); - __flush_dcache_area((__force void *)park_exit, - sizeof(unsigned long)); - flush_icache_range((unsigned long)park_exit, - (unsigned long)(park_exit + 1)); - sev(); - dsb(sy); - isb(); - - pr_debug("Write cpu %u secondary entry 0x%lx to 0x%lx.", - cpu, *park_exit, (unsigned long)park_exit); - pr_info("Boot cpu %u from PARK state.", cpu); - return 0; - } - - return -EPERM; -} - -/* Install cpu park sections for the specific cpu. 
*/ -static int install_cpu_park(unsigned int cpu) -{ - struct cpu_park_section *park_section; - unsigned long *park_exit; - unsigned long *park_magic; - unsigned long park_text_len; - - park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); - pr_debug("Install cpu park on cpu %u park exit 0x%lx park text 0x%lx", - cpu, (unsigned long)park_section, - (unsigned long)(park_section->text)); - - park_exit = &park_section->exit; - park_magic = &park_section->magic; - park_text_len = PARK_SECTION_SIZE - sizeof(struct cpu_park_section); - - *park_exit = 0UL; - *park_magic = 0UL; - memcpy((void *)park_section->text, do_cpu_park, park_text_len); - __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE); - - return 0; -} - -static int uninstall_cpu_park(unsigned int cpu) -{ - unsigned long park_section; - - if (mmap_cpu_park_mem() != 0) - return -EPERM; - - park_section = cpu_park_section_v(cpu); - memset((void *)park_section, 0, PARK_SECTION_SIZE); - __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE); - - return 0; -} - -static int cpu_wait_park(unsigned int cpu) -{ - long timeout; - struct cpu_park_section *park_section; - - volatile unsigned long *park_magic; - - park_section = (struct cpu_park_section *)cpu_park_section_v(cpu); - park_magic = &park_section->magic; - - timeout = USEC_PER_SEC; - while (*park_magic != PARK_MAGIC && timeout--) - udelay(1); - - if (timeout > 0) - pr_debug("cpu %u park done.", cpu); - else - pr_err("cpu %u park failed.", cpu); - - return *park_magic == PARK_MAGIC; -} - -static void cpu_park(unsigned int cpu) -{ - unsigned long park_section_p; - unsigned long park_exit_phy; - unsigned long do_park; - typeof(enter_cpu_park) *park; - - park_section_p = cpu_park_section_p(cpu); - park_exit_phy = park_section_p; - pr_debug("Go to park cpu %u exit address 0x%lx", cpu, park_exit_phy); - - do_park = park_section_p + sizeof(struct cpu_park_section); - park = (void *)__pa_symbol(enter_cpu_park); - - cpu_install_idmap(); - park(do_park, park_exit_phy); - unreachable(); -} -#endif
/* * Boot a secondary CPU, and assign it the specified idle task. @@ -268,10 +108,8 @@ static int boot_secondary(unsigned int cpu, struct task_struct *idle) { const struct cpu_operations *ops = get_cpu_ops(cpu);
-#ifdef CONFIG_ARM64_CPU_PARK if (write_park_exit(cpu) == 0) return 0; -#endif if (ops->cpu_boot) return ops->cpu_boot(cpu);
@@ -307,9 +145,8 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) */ wait_for_completion_timeout(&cpu_running, msecs_to_jiffies(5000)); -#ifdef CONFIG_ARM64_CPU_PARK uninstall_cpu_park(cpu); -#endif + if (cpu_online(cpu)) return 0;
@@ -1057,31 +894,12 @@ void arch_irq_work_raise(void)
static void local_cpu_stop(void) { -#ifdef CONFIG_ARM64_CPU_PARK - int cpu; - const struct cpu_operations *ops = NULL; -#endif - set_cpu_online(smp_processor_id(), false);
local_daif_mask(); sdei_mask_local_cpu();
-#ifdef CONFIG_ARM64_CPU_PARK - /* - * Go to cpu park state. - * Otherwise go to cpu die. - */ - cpu = smp_processor_id(); - if (kexec_in_progress && park_info.start_v) { - machine_kexec_mask_interrupts(); - cpu_park(cpu); - - ops = get_cpu_ops(cpu); - if (ops && ops->cpu_die) - ops->cpu_die(cpu); - } -#endif + cpu_park_stop();
cpu_park_loop(); } @@ -1295,44 +1113,10 @@ void smp_send_stop(void) sdei_mask_local_cpu(); }
-#ifdef CONFIG_ARM64_CPU_PARK -int kexec_smp_send_park(void) +void smp_cross_send_stop(cpumask_t *mask) { - unsigned long cpu; - - if (WARN_ON(!kexec_in_progress)) { - pr_crit("%s called not in kexec progress.", __func__); - return -EPERM; - } - - if (mmap_cpu_park_mem() != 0) { - pr_info("no cpuparkmem, goto normal way."); - return -EPERM; - } - - local_irq_disable(); - - if (num_online_cpus() > 1) { - cpumask_t mask; - - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); - - for_each_cpu(cpu, &mask) - install_cpu_park(cpu); - smp_cross_call(&mask, IPI_CPU_STOP); - - /* Wait for other CPUs to park */ - for_each_cpu(cpu, &mask) - cpu_wait_park(cpu); - pr_info("smp park other cpus done\n"); - } - - sdei_mask_local_cpu(); - - return 0; + smp_cross_call(mask, IPI_CPU_STOP); } -#endif
#ifdef CONFIG_KEXEC_CORE void crash_smp_send_stop(void) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 90411356b8b2..2f3910beb4cf 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -44,6 +44,7 @@ #include <linux/sizes.h> #include <asm/tlb.h> #include <asm/alternative.h> +#include <asm/cpu_park.h>
#include "internal.h"
@@ -268,57 +269,6 @@ static void __init fdt_enforce_memory_region(void) memblock_add(usable_rgns[1].base, usable_rgns[1].size); }
-#ifdef CONFIG_ARM64_CPU_PARK -struct cpu_park_info park_info = { - .start = 0, - .len = PARK_SECTION_SIZE * NR_CPUS, - .start_v = 0, -}; - -static int __init parse_park_mem(char *p) -{ - if (!p) - return 0; - - park_info.start = PAGE_ALIGN(memparse(p, NULL)); - if (park_info.start == 0) - pr_info("cpu park mem params[%s]", p); - - return 0; -} -early_param("cpuparkmem", parse_park_mem); - -static int __init reserve_park_mem(void) -{ - if (park_info.start == 0 || park_info.len == 0) - return 0; - - park_info.start = PAGE_ALIGN(park_info.start); - park_info.len = PAGE_ALIGN(park_info.len); - - if (!memblock_is_region_memory(park_info.start, park_info.len)) { - pr_warn("cannot reserve park mem: region is not memory!"); - goto out; - } - - if (memblock_is_region_reserved(park_info.start, park_info.len)) { - pr_warn("cannot reserve park mem: region overlaps reserved memory!"); - goto out; - } - - memblock_remove(park_info.start, park_info.len); - pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%ld MB)", - park_info.start, park_info.start + park_info.len, - park_info.len >> 20); - - return 0; -out: - park_info.start = 0; - park_info.len = 0; - return -EINVAL; -} -#endif - static int need_remove_real_memblock __initdata;
static int __init parse_memmap_one(char *p) @@ -542,9 +492,7 @@ void __init bootmem_init(void) * So reserve park memory firstly is better, but it may cause * crashkernel or quickkexec reserving failed. */ -#ifdef CONFIG_ARM64_CPU_PARK reserve_park_mem(); -#endif
/* * request_standard_resources() depends on crashkernel's memory being
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion
from mainline-net-5.17
commit 6dde452bceca3f2ed2b33bc46a16ff5682a03a2e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/commit/?id=6d...
----------------------------------------------------------------------
When multiple threads concurrently access the debugfs content, data and pointer exceptions may occur. Therefore, mutex lock protection is added for debugfs.
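The serialization the patch adds is a single per-handle mutex taken across the whole read path. Condensed from hns3_dbg_read() in the diff below, every exit now drops dbgfs_lock (the mutex is initialised in hns3_dbg_init() and destroyed in hns3_dbg_uninit()):

	mutex_lock(&handle->dbgfs_lock);
	save_buf = &hns3_dbg_cmd[index].buf;
	...				/* look up or allocate the command buffer */
	size = simple_read_from_buffer(buffer, count, ppos, read_buf,
				       strlen(read_buf));
	if (size > 0) {
		mutex_unlock(&handle->dbgfs_lock);
		return size;
	}
out:
	...				/* free the buffer for the last read */
	mutex_unlock(&handle->dbgfs_lock);
	return ret;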
Fixes: 5e69ea7ee2a6 ("net: hns3: refactor the debugfs process")
Signed-off-by: Yufeng Mo moyufeng@huawei.com
Signed-off-by: Guangbin Huang huangguangbin2@huawei.com
Signed-off-by: David S. Miller davem@davemloft.net
Reviewed-by: Yue Haibing yuehaibing@huawei.com
Acked-by: Xie XiuQi xiexiuqi@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h  |  2 ++
 .../ethernet/hisilicon/hns3/hns3_debugfs.c   | 20 +++++++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index cd0ef12b6e85..77d55a71f40e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -840,6 +840,8 @@ struct hnae3_handle {
u8 netdev_flags; struct dentry *hnae3_dbgfs; + /* protects concurrent contention between debugfs commands */ + struct mutex dbgfs_lock;
/* Network interface message level enabled bits */ u32 msg_enable; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 817e2e8a7287..f726a5b70f9e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -1226,6 +1226,7 @@ static ssize_t hns3_dbg_read(struct file *filp, char __user *buffer, if (ret) return ret;
+ mutex_lock(&handle->dbgfs_lock); save_buf = &hns3_dbg_cmd[index].buf;
if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) || @@ -1238,15 +1239,15 @@ static ssize_t hns3_dbg_read(struct file *filp, char __user *buffer, read_buf = *save_buf; } else { read_buf = kvzalloc(hns3_dbg_cmd[index].buf_len, GFP_KERNEL); - if (!read_buf) - return -ENOMEM; + if (!read_buf) { + ret = -ENOMEM; + goto out; + }
/* save the buffer addr until the last read operation */ *save_buf = read_buf; - }
- /* get data ready for the first time to read */ - if (!*ppos) { + /* get data ready for the first time to read */ ret = hns3_dbg_read_cmd(dbg_data, hns3_dbg_cmd[index].cmd, read_buf, hns3_dbg_cmd[index].buf_len); if (ret) @@ -1255,8 +1256,10 @@ static ssize_t hns3_dbg_read(struct file *filp, char __user *buffer,
size = simple_read_from_buffer(buffer, count, ppos, read_buf, strlen(read_buf)); - if (size > 0) + if (size > 0) { + mutex_unlock(&handle->dbgfs_lock); return size; + }
out: /* free the buffer for the last read operation */ @@ -1265,6 +1268,7 @@ static ssize_t hns3_dbg_read(struct file *filp, char __user *buffer, *save_buf = NULL; }
+ mutex_unlock(&handle->dbgfs_lock); return ret; }
@@ -1337,6 +1341,8 @@ int hns3_dbg_init(struct hnae3_handle *handle) debugfs_create_dir(hns3_dbg_dentry[i].name, handle->hnae3_dbgfs);
+ mutex_init(&handle->dbgfs_lock); + for (i = 0; i < ARRAY_SIZE(hns3_dbg_cmd); i++) { if ((hns3_dbg_cmd[i].cmd == HNAE3_DBG_CMD_TM_NODES && ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2) || @@ -1363,6 +1369,7 @@ int hns3_dbg_init(struct hnae3_handle *handle) return 0;
out: + mutex_destroy(&handle->dbgfs_lock); debugfs_remove_recursive(handle->hnae3_dbgfs); handle->hnae3_dbgfs = NULL; return ret; @@ -1378,6 +1385,7 @@ void hns3_dbg_uninit(struct hnae3_handle *handle) hns3_dbg_cmd[i].buf = NULL; }
+ mutex_destroy(&handle->dbgfs_lock); debugfs_remove_recursive(handle->hnae3_dbgfs); handle->hnae3_dbgfs = NULL; }
From: Yufeng Mo moyufeng@huawei.com
mainline inclusion from mainline-net-5.17 commit 2f61353cd2f789a4229b6f5c1c24a40a613357bb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/commit/?id=2f...
----------------------------------------------------------------------
Since some interrupt states may be cleared by hardware, the driver may receive an empty interrupt. Currently, the VF driver directly disables the vector0 interrupt in this case. As a result, the VF is unavailable. Therefore, the vector0 interrupt should be enabled in this case.
Fixes: b90fcc5bd904 ("net: hns3: add reset handling for VF when doing Core/Global/IMP reset") Signed-off-by: Yufeng Mo moyufeng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index a2ec2d8c2151..675f871a8df6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2052,8 +2052,7 @@ static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) break; }
- if (event_cause != HCLGEVF_VECTOR0_EVENT_OTHER) - hclgevf_enable_vector(&hdev->misc_vector, true); + hclgevf_enable_vector(&hdev->misc_vector, true);
return IRQ_HANDLED; }
From: Jian Shen shenjian15@huawei.com
mainline inclusion from mainline-net-5.17 commit ccb18f05535c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
If MAC address A is configured to vport A and then to vport B, the entry for MAC address A of vport A in the hardware becomes invalid. If the address of vport A is later changed to MAC address B, the driver needs to delete MAC address A of vport A. Because that entry has already become invalid in hardware, "-ENOENT" is returned. In this case, the "used_umv_size" value recorded in the driver is not updated. As a result, the MAC entry status of the software is inconsistent with that of the hardware.
Therefore, update the umv size even if the MAC entry cannot be found, to ensure that the software and hardware status stays consistent.
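The accounting rule can be pictured with the sketch below; the helpers are hypothetical stand-ins for hclge_remove_mac_vlan_tbl() and hclge_update_umv_space().

static int rm_uc_addr_sketch(struct hclge_vport *vport, const u8 *addr)
{
	int ret = remove_mac_entry_hw(vport, addr);	/* hypothetical helper */

	/* -ENOENT means the entry is already gone from hardware, so the
	 * reserved UMV slot is released exactly as on success
	 */
	if (!ret || ret == -ENOENT) {
		release_umv_slot(vport);		/* hypothetical helper */
		return 0;
	}
	return ret;
}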
Fixes: ee4bcd3b7ae4 ("net: hns3: refactor the MAC address configure") Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 1e4c89d4b96b..7f215530c501 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8438,12 +8438,11 @@ int hclge_rm_uc_addr_common(struct hclge_vport *vport, hnae3_set_bit(req.entry_type, HCLGE_MAC_VLAN_BIT0_EN_B, 0); hclge_prepare_mac_addr(&req, addr, false); ret = hclge_remove_mac_vlan_tbl(vport, &req); - if (!ret) { + if (!ret || ret == -ENOENT) { mutex_lock(&hdev->vport_lock); hclge_update_umv_space(vport, true); mutex_unlock(&hdev->vport_lock); - } else if (ret == -ENOENT) { - ret = 0; + return 0; }
return ret;
From: Jian Shen shenjian15@huawei.com
mainline inclusion from mainline-net-5.17 commit c0f46de30c96 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently, port base vlan is initiated by the PF and configured to its VFs by using the command "ip link set <pf name> vf <vf id> vlan <vlan id>". When a global reset is triggered, the PF clears the hardware vlan table and the soft recorded vlan information, and does not restore them until the VFs are ready. There is a short time window between the table being cleared and being restored. If a new port base vlan tag is configured in this window, the driver checks the soft recorded vlan information, finds that the old tag is not there, and prints a warning.
Since the port base vlan is managed by the PF, restoring the VFs' port base vlan should be handled by the PF once the PF is ready.
This patch fixes it.
Fixes: 039ba863e8d7 ("net: hns3: optimize the filter table entries handling when resetting") Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 62 +++++++++++++------ .../hisilicon/hns3/hns3pf/hclge_main.h | 3 + 2 files changed, 46 insertions(+), 19 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 7f215530c501..3e78c158eca3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1872,6 +1872,7 @@ static int hclge_alloc_vport(struct hclge_dev *hdev) vport->vf_info.link_state = IFLA_VF_LINK_STATE_AUTO; vport->mps = HCLGE_MAC_DEFAULT_FRAME; vport->port_base_vlan_cfg.state = HNAE3_PORT_BASE_VLAN_DISABLE; + vport->port_base_vlan_cfg.tbl_sta = true; vport->rxvlan_cfg.rx_vlan_offload_en = true; vport->req_vlan_fltr_en = true; INIT_LIST_HEAD(&vport->vlan_list); @@ -9915,34 +9916,52 @@ void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev) } }
-void hclge_restore_vport_vlan_table(struct hclge_vport *vport) +void hclge_restore_vport_port_base_vlan_config(struct hclge_dev *hdev) { - struct hclge_vport_vlan_cfg *vlan, *tmp; - struct hclge_dev *hdev = vport->back; + struct hclge_vlan_info *vlan_info; + struct hclge_vport *vport; u16 vlan_proto; u16 vlan_id; u16 state; + int vf_id; int ret;
- vlan_proto = vport->port_base_vlan_cfg.vlan_info.vlan_proto; - vlan_id = vport->port_base_vlan_cfg.vlan_info.vlan_tag; - state = vport->port_base_vlan_cfg.state; + /* PF should restore all vfs port base vlan */ + for (vf_id = 0; vf_id < hdev->num_alloc_vfs; vf_id++) { + vport = &hdev->vport[vf_id + HCLGE_VF_VPORT_START_NUM]; + vlan_info = vport->port_base_vlan_cfg.tbl_sta ? + &vport->port_base_vlan_cfg.vlan_info : + &vport->port_base_vlan_cfg.old_vlan_info;
- if (state != HNAE3_PORT_BASE_VLAN_DISABLE) { - clear_bit(vport->vport_id, hdev->vlan_table[vlan_id]); - hclge_set_vlan_filter_hw(hdev, htons(vlan_proto), - vport->vport_id, vlan_id, - false); - return; + vlan_id = vlan_info->vlan_tag; + vlan_proto = vlan_info->vlan_proto; + state = vport->port_base_vlan_cfg.state; + + if (state != HNAE3_PORT_BASE_VLAN_DISABLE) { + clear_bit(vport->vport_id, hdev->vlan_table[vlan_id]); + ret = hclge_set_vlan_filter_hw(hdev, htons(vlan_proto), + vport->vport_id, + vlan_id, false); + vport->port_base_vlan_cfg.tbl_sta = ret == 0; + } } +}
- list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { - ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q), - vport->vport_id, - vlan->vlan_id, false); - if (ret) - break; - vlan->hd_tbl_status = true; +void hclge_restore_vport_vlan_table(struct hclge_vport *vport) +{ + struct hclge_vport_vlan_cfg *vlan, *tmp; + struct hclge_dev *hdev = vport->back; + int ret; + + if (vport->port_base_vlan_cfg.state == HNAE3_PORT_BASE_VLAN_DISABLE) { + list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { + ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q), + vport->vport_id, + vlan->vlan_id, false); + if (ret) + break; + vlan->hd_tbl_status = true; + } } }
@@ -9983,6 +10002,7 @@ static void hclge_restore_hw_table(struct hclge_dev *hdev) struct hnae3_handle *handle = &vport->nic;
hclge_restore_mac_table_common(vport); + hclge_restore_vport_port_base_vlan_config(hdev); hclge_restore_vport_vlan_table(vport); set_bit(HCLGE_STATE_FD_USER_DEF_CHANGED, &hdev->state); hclge_restore_fd_entries(handle); @@ -10039,6 +10059,8 @@ static int hclge_update_vlan_filter_entries(struct hclge_vport *vport, false); }
+ vport->port_base_vlan_cfg.tbl_sta = false; + /* force add VLAN 0 */ ret = hclge_set_vf_vlan_common(hdev, vport->vport_id, false, 0); if (ret) @@ -10128,7 +10150,9 @@ int hclge_update_port_base_vlan_cfg(struct hclge_vport *vport, u16 state, else nic->port_base_vlan_state = HNAE3_PORT_BASE_VLAN_ENABLE;
+ vport->port_base_vlan_cfg.old_vlan_info = *old_vlan_info; vport->port_base_vlan_cfg.vlan_info = *vlan_info; + vport->port_base_vlan_cfg.tbl_sta = true; hclge_set_vport_vlan_fltr_change(vport);
return 0; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index f7f5a4b09068..e817765f29d3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -985,7 +985,9 @@ struct hclge_vlan_info {
struct hclge_port_base_vlan_config { u16 state; + bool tbl_sta; struct hclge_vlan_info vlan_info; + struct hclge_vlan_info old_vlan_info; };
struct hclge_vf_info { @@ -1105,6 +1107,7 @@ void hclge_rm_vport_all_mac_table(struct hclge_vport *vport, bool is_del_list, void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list); void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev); void hclge_restore_mac_table_common(struct hclge_vport *vport); +void hclge_restore_vport_port_base_vlan_config(struct hclge_dev *hdev); void hclge_restore_vport_vlan_table(struct hclge_vport *vport); int hclge_update_port_base_vlan_cfg(struct hclge_vport *vport, u16 state, struct hclge_vlan_info *vlan_info);
From: Jian Shen shenjian15@huawei.com
mainline inclusion from mainline-net-5.17 commit 1932a624ab88 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
When adding a port base VLAN, the vf VLAN needs to be removed from HW and its state in the vf VLAN list set to false. If the periodic task is freeing the same node at that time, a "use after free" error may occur. This patch adds a vlan list lock to protect the vlan list.
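The race and its fix follow the usual list-plus-mutex pattern, sketched below with simplified names; the real patch reuses hdev->vport_lock rather than adding a new lock, and the nodes are struct hclge_vport_vlan_cfg.

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct vf_vlan_sketch {
	struct list_head node;
	u16 vlan_id;
};

static void vf_vlan_del_sketch(struct mutex *lock, struct list_head *head,
			       u16 vlan_id)
{
	struct vf_vlan_sketch *v, *tmp;

	mutex_lock(lock);		/* same lock taken by the periodic task */
	list_for_each_entry_safe(v, tmp, head, node) {
		if (v->vlan_id == vlan_id) {
			list_del(&v->node);
			kfree(v);
			break;
		}
	}
	mutex_unlock(lock);
}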
Fixes: c6075b193462 ("net: hns3: Record VF vlan tables") Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 38 +++++++++++++++++-- .../hisilicon/hns3/hns3pf/hclge_main.h | 1 + 2 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 3e78c158eca3..69ec7e826a2b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9818,19 +9818,28 @@ static void hclge_add_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id, bool writen_to_tbl) { struct hclge_vport_vlan_cfg *vlan, *tmp; + struct hclge_dev *hdev = vport->back;
- list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) - if (vlan->vlan_id == vlan_id) + mutex_lock(&hdev->vport_lock); + + list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { + if (vlan->vlan_id == vlan_id) { + mutex_unlock(&hdev->vport_lock); return; + } + }
vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); - if (!vlan) + if (!vlan) { + mutex_unlock(&hdev->vport_lock); return; + }
vlan->hd_tbl_status = writen_to_tbl; vlan->vlan_id = vlan_id;
list_add_tail(&vlan->node, &vport->vlan_list); + mutex_unlock(&hdev->vport_lock); }
static int hclge_add_vport_all_vlan_table(struct hclge_vport *vport) @@ -9839,6 +9848,8 @@ static int hclge_add_vport_all_vlan_table(struct hclge_vport *vport) struct hclge_dev *hdev = vport->back; int ret;
+ mutex_lock(&hdev->vport_lock); + list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { if (!vlan->hd_tbl_status) { ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q), @@ -9848,12 +9859,16 @@ static int hclge_add_vport_all_vlan_table(struct hclge_vport *vport) dev_err(&hdev->pdev->dev, "restore vport vlan list failed, ret=%d\n", ret); + + mutex_unlock(&hdev->vport_lock); return ret; } } vlan->hd_tbl_status = true; }
+ mutex_unlock(&hdev->vport_lock); + return 0; }
@@ -9863,6 +9878,8 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id, struct hclge_vport_vlan_cfg *vlan, *tmp; struct hclge_dev *hdev = vport->back;
+ mutex_lock(&hdev->vport_lock); + list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { if (vlan->vlan_id == vlan_id) { if (is_write_tbl && vlan->hd_tbl_status) @@ -9877,6 +9894,8 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id, break; } } + + mutex_unlock(&hdev->vport_lock); }
void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list) @@ -9884,6 +9903,8 @@ void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list) struct hclge_vport_vlan_cfg *vlan, *tmp; struct hclge_dev *hdev = vport->back;
+ mutex_lock(&hdev->vport_lock); + list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { if (vlan->hd_tbl_status) hclge_set_vlan_filter_hw(hdev, @@ -9899,6 +9920,7 @@ void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list) } } clear_bit(vport->vport_id, hdev->vf_vlan_full); + mutex_unlock(&hdev->vport_lock); }
void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev) @@ -9907,6 +9929,8 @@ void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev) struct hclge_vport *vport; int i;
+ mutex_lock(&hdev->vport_lock); + for (i = 0; i < hdev->num_alloc_vport; i++) { vport = &hdev->vport[i]; list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { @@ -9914,6 +9938,8 @@ void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev) kfree(vlan); } } + + mutex_unlock(&hdev->vport_lock); }
void hclge_restore_vport_port_base_vlan_config(struct hclge_dev *hdev) @@ -9953,6 +9979,8 @@ void hclge_restore_vport_vlan_table(struct hclge_vport *vport) struct hclge_dev *hdev = vport->back; int ret;
+ mutex_lock(&hdev->vport_lock); + if (vport->port_base_vlan_cfg.state == HNAE3_PORT_BASE_VLAN_DISABLE) { list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q), @@ -9963,6 +9991,8 @@ void hclge_restore_vport_vlan_table(struct hclge_vport *vport) vlan->hd_tbl_status = true; } } + + mutex_unlock(&hdev->vport_lock); }
/* For global reset and imp reset, hardware will clear the mac table, @@ -11861,8 +11891,8 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_misc_irq_uninit(hdev); hclge_devlink_uninit(hdev); hclge_pci_uninit(hdev); - mutex_destroy(&hdev->vport_lock); hclge_uninit_vport_vlan_table(hdev); + mutex_destroy(&hdev->vport_lock); ae_dev->priv = NULL; }
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index e817765f29d3..025fd73ea485 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -1033,6 +1033,7 @@ struct hclge_vport { spinlock_t mac_list_lock; /* protect mac address need to add/detele */ struct list_head uc_mac_list; /* Store VF unicast table */ struct list_head mc_mac_list; /* Store VF multicast table */ + struct list_head vlan_list; /* Store VF vlan table */ };
From: Jian Shen shenjian15@huawei.com
mainline inclusion from mainline-net-5.17 commit 190cd8a72b01 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently, when the PF sets a VF VLAN, it sends a notify mailbox to the VF if the VF is alive. The VF stops its traffic and sends a request mailbox to the PF, then the PF updates the VF VLAN. It's a bit complex. If the VF is killed before sending the request, the PF will not set the VF VLAN and leaves no log.
This patch refines the process: the PF sets the VF VLAN directly and then notifies the VF. If the VF is resetting at that time, the notification may be dropped, so the VF should query it after the reset has finished.
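Conceptually the new flow is "configure first, notify best-effort", roughly as in this sketch; the helpers are hypothetical stand-ins for hclge_update_port_base_vlan_cfg() and hclge_push_vf_port_base_vlan_info().

static int set_vf_vlan_sketch(struct hclge_vport *vport, u16 vlan_id)
{
	int ret = pf_program_vf_vlan_hw(vport, vlan_id);	/* configure first */

	if (ret)
		return ret;

	/* Best-effort notify: the mailbox may be lost if the VF is resetting,
	 * so the VF re-queries the port base vlan state after its reset.
	 */
	if (test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state))
		(void)notify_vf_vlan_change(vport, vlan_id);	/* hypothetical */

	return 0;
}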
Fixes: 92f11ea177cd ("net: hns3: fix set port based VLAN issue for VF") Signed-off-by: Jian Shen shenjian15@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../hisilicon/hns3/hns3pf/hclge_main.c | 18 +++++++++++++----- .../hisilicon/hns3/hns3vf/hclgevf_main.c | 5 +++++ 2 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 69ec7e826a2b..819d0ba6749e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8993,11 +8993,16 @@ static int hclge_set_vf_mac(struct hnae3_handle *handle, int vf,
ether_addr_copy(vport->vf_info.mac, mac_addr);
+ /* there is a timewindow for PF to know VF unalive, it may + * cause send mailbox fail, but it doesn't matter, VF will + * query it when reinit. + */ if (test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) { dev_info(&hdev->pdev->dev, "MAC of VF %d has been set to %s, and it will be reinitialized!\n", vf, format_mac_addr); - return hclge_inform_reset_assert_to_vf(vport); + (void)hclge_inform_reset_assert_to_vf(vport); + return 0; }
dev_info(&hdev->pdev->dev, "MAC of VF %d has been set to %s\n", @@ -10250,14 +10255,17 @@ static int hclge_set_vf_vlan_filter(struct hnae3_handle *handle, int vfid, return ret; }
- /* for DEVICE_VERSION_V3, vf doesn't need to know about the port based + /* there is a timewindow for PF to know VF unalive, it may + * cause send mailbox fail, but it doesn't matter, VF will + * query it when reinit. + * for DEVICE_VERSION_V3, vf doesn't need to know about the port based * VLAN state. */ if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V3 && test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) - hclge_push_vf_port_base_vlan_info(&hdev->vport[0], - vport->vport_id, state, - &vlan_info); + (void)hclge_push_vf_port_base_vlan_info(&hdev->vport[0], + vport->vport_id, + state, &vlan_info);
return 0; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 675f871a8df6..c956da60d90a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2862,6 +2862,11 @@ static int hclgevf_reset_hdev(struct hclgevf_dev *hdev) return ret; }
+ /* get current port based vlan state from PF */ + ret = hclgevf_get_port_base_vlan_filter_state(hdev); + if (ret) + return ret; + set_bit(HCLGEVF_STATE_PROMISC_CHANGED, &hdev->state);
hclgevf_init_rxd_adv_layout(hdev);
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-net-5.17 commit 877837211802 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
When ethtool is used to set the tx copybreak buf size to a large value that makes the order exceed 10, or when memory is insufficient, allocating the tx copybreak buffer fails and "the active tx spare buf is 0, not enabled tx spare buffer" is printed. However, querying the tx copybreak buf size with the --get-tunable parameter still reports the configured value instead of 0.
So it is necessary to change the reported value from the configured value to 0.
Set kinfo.tx_spare_buf_size to 0 when setting the tx copybreak buf size fails.
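A simplified sketch of the idea, with a hypothetical allocation helper: every failure path rolls the recorded size back to 0 so ethtool --get-tunable matches what is actually enabled.

static void init_tx_spare_sketch(struct hns3_enet_ring *ring, u32 *recorded_size)
{
	struct hns3_tx_spare *buf = try_alloc_tx_spare(ring, *recorded_size);

	if (!buf) {
		*recorded_size = 0;	/* buffer not enabled, report 0 */
		return;
	}
	ring->tx_spare = buf;
}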
Fixes: e445f08af2b1 ("net: hns3: add support to set/get tx copybreak buf size via ethtool for hns3 driver") Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 20 +++++++++++-------- .../ethernet/hisilicon/hns3/hns3_ethtool.c | 3 ++- 2 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index edaf84e7fc85..ffdda90a30a7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1027,13 +1027,12 @@ static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring,
static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) { + u32 alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size; struct hns3_tx_spare *tx_spare; struct page *page; - u32 alloc_size; dma_addr_t dma; int order;
- alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size; if (!alloc_size) return;
@@ -1043,30 +1042,35 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) if (!tx_spare) { /* The driver still work without the tx spare buffer */ dev_warn(ring_to_dev(ring), "failed to allocate hns3_tx_spare\n"); - return; + goto devm_kzalloc_error; }
page = alloc_pages_node(dev_to_node(ring_to_dev(ring)), GFP_KERNEL, order); if (!page) { dev_warn(ring_to_dev(ring), "failed to allocate tx spare pages\n"); - devm_kfree(ring_to_dev(ring), tx_spare); - return; + goto alloc_pages_error; }
dma = dma_map_page(ring_to_dev(ring), page, 0, PAGE_SIZE << order, DMA_TO_DEVICE); if (dma_mapping_error(ring_to_dev(ring), dma)) { dev_warn(ring_to_dev(ring), "failed to map pages for tx spare\n"); - put_page(page); - devm_kfree(ring_to_dev(ring), tx_spare); - return; + goto dma_mapping_error; }
tx_spare->dma = dma; tx_spare->buf = page_address(page); tx_spare->len = PAGE_SIZE << order; ring->tx_spare = tx_spare; + return; + +dma_mapping_error: + put_page(page); +alloc_pages_error: + devm_kfree(ring_to_dev(ring), tx_spare); +devm_kzalloc_error: + ring->tqp->handle->kinfo.tx_spare_buf_size = 0; }
/* Use hns3_tx_spare_space() to make sure there is enough buffer diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index a7cf5fee9f48..2fdfaeff9c0e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1835,7 +1835,8 @@ static int hns3_set_tunable(struct net_device *netdev, old_tx_spare_buf_size = h->kinfo.tx_spare_buf_size; new_tx_spare_buf_size = *(u32 *)data; ret = hns3_set_tx_spare_buf_size(netdev, new_tx_spare_buf_size); - if (ret) { + if (ret || + (!priv->ring->tx_spare && new_tx_spare_buf_size != 0)) { int ret1;
netdev_warn(netdev,
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-net-5.17 commit a89cbb16995b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Add a max order check for the tx spare buffer to avoid triggering a call trace, and print related failure information instead, when the user sets the tx spare buf size to a value so large that the order exceeds 10.
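For reference, get_order() maps a byte size to a page-allocation order, and alloc_pages() cannot satisfy order >= MAX_ORDER (an order above 10 with the default configuration), hence the pre-check; an illustrative form of the guard:

#include <linux/mm.h>

static bool tx_spare_size_ok(unsigned int alloc_size)
{
	/* bail out before handing an unserviceable order to the allocator */
	return get_order(alloc_size) < MAX_ORDER;
}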
Fixes: e445f08af2b1 ("net: hns3: add support to set/get tx copybreak buf size via ethtool for hns3 driver") Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index ffdda90a30a7..d7be9291def1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1037,6 +1037,12 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) return;
order = get_order(alloc_size); + if (order >= MAX_ORDER) { + if (net_ratelimit()) + dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n"); + return; + } + tx_spare = devm_kzalloc(ring_to_dev(ring), sizeof(*tx_spare), GFP_KERNEL); if (!tx_spare) {
From: Peng Li lipeng321@huawei.com
mainline inclusion from mainline-net-5.17 commit 671cb8cbb9c9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
After SR-IOV is disabled, the VF still has some config and info, configured by the PF, that needs to be cleaned up. This patch cleans the HW config and the SW struct vport->vf_info.
Fixes: fa8d82e853e8 ("net: hns3: Add support of .sriov_configure in HNS3 driver") Signed-off-by: Peng Li lipeng321@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 3 ++ .../net/ethernet/hisilicon/hns3/hns3_enet.c | 18 +++++++ .../hisilicon/hns3/hns3pf/hclge_main.c | 50 +++++++++++++++++++ 3 files changed, 71 insertions(+)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 77d55a71f40e..6618ab32ab25 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -536,6 +536,8 @@ struct hnae3_ae_dev { * Get 1588 rx hwstamp * get_ts_info * Get phc info + * clean_vf_config + * Clean residual vf info after disable sriov */ struct hnae3_ae_ops { int (*init_ae_dev)(struct hnae3_ae_dev *ae_dev); @@ -729,6 +731,7 @@ struct hnae3_ae_ops { struct ethtool_ts_info *info); int (*get_link_diagnosis_info)(struct hnae3_handle *handle, u32 *status_code); + void (*clean_vf_config)(struct hnae3_ae_dev *ae_dev, int num_vfs); };
struct hnae3_dcb_ops { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index d7be9291def1..e3911116a88f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3059,6 +3059,21 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return ret; }
+/** + * hns3_clean_vf_config + * @pdev: pointer to a pci_dev structure + * @num_vfs: number of VFs allocated + * + * Clean residual vf config after disable sriov + **/ +static void hns3_clean_vf_config(struct pci_dev *pdev, int num_vfs) +{ + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); + + if (ae_dev->ops->clean_vf_config) + ae_dev->ops->clean_vf_config(ae_dev, num_vfs); +} + /* hns3_remove - Device removal routine * @pdev: PCI device information struct */ @@ -3097,7 +3112,10 @@ static int hns3_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) else return num_vfs; } else if (!pci_vfs_assigned(pdev)) { + int num_vfs_pre = pci_num_vf(pdev); + pci_disable_sriov(pdev); + hns3_clean_vf_config(pdev, num_vfs_pre); } else { dev_warn(&pdev->dev, "Unable to free VFs because some are assigned to VMs.\n"); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 819d0ba6749e..db8455955a00 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -12724,6 +12724,55 @@ static int hclge_get_link_diagnosis_info(struct hnae3_handle *handle, return 0; }
+/* After disable sriov, VF still has some config and info need clean, + * which configed by PF. + */ +static void hclge_clear_vport_vf_info(struct hclge_vport *vport, int vfid) +{ + struct hclge_dev *hdev = vport->back; + struct hclge_vlan_info vlan_info; + int ret; + + /* after disable sriov, clean VF rate configured by PF */ + ret = hclge_tm_qs_shaper_cfg(vport, 0); + if (ret) + dev_err(&hdev->pdev->dev, + "failed to clean vf%d rate config, ret = %d\n", + vfid, ret); + + vlan_info.vlan_tag = 0; + vlan_info.qos = 0; + vlan_info.vlan_proto = ETH_P_8021Q; + ret = hclge_update_port_base_vlan_cfg(vport, + HNAE3_PORT_BASE_VLAN_DISABLE, + &vlan_info); + if (ret) + dev_err(&hdev->pdev->dev, + "failed to clean vf%d port base vlan, ret = %d\n", + vfid, ret); + + ret = hclge_set_vf_spoofchk_hw(hdev, vport->vport_id, false); + if (ret) + dev_err(&hdev->pdev->dev, + "failed to clean vf%d spoof config, ret = %d\n", + vfid, ret); + + memset(&vport->vf_info, 0, sizeof(vport->vf_info)); +} + +static void hclge_clean_vport_config(struct hnae3_ae_dev *ae_dev, int num_vfs) +{ + struct hclge_dev *hdev = ae_dev->priv; + struct hclge_vport *vport; + int i; + + for (i = 0; i < num_vfs; i++) { + vport = &hdev->vport[i + HCLGE_VF_VPORT_START_NUM]; + + hclge_clear_vport_vf_info(vport, i); + } +} + static const struct hnae3_ae_ops hclge_ops = { .init_ae_dev = hclge_init_ae_dev, .uninit_ae_dev = hclge_uninit_ae_dev, @@ -12825,6 +12874,7 @@ static const struct hnae3_ae_ops hclge_ops = { .get_rx_hwts = hclge_ptp_get_rx_hwts, .get_ts_info = hclge_ptp_get_ts_info, .get_link_diagnosis_info = hclge_get_link_diagnosis_info, + .clean_vf_config = hclge_clean_vport_config, };
static struct hnae3_ae_algo ae_algo = {
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-net-5.17 commit f5cd60169f98 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
When the PCI device reset fails, the driver performs the uninit operation and priv->ring becomes NULL; accessing it then causes a NULL pointer dereference.
Add a netdev reset check to hns3_set_tunable() to fix it.
Fixes: 99f6b5fb5f63 ("net: hns3: use bounce buffer when rx page can not be reused") Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 2fdfaeff9c0e..b489c5993ad7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1783,9 +1783,6 @@ static int hns3_set_tx_spare_buf_size(struct net_device *netdev, struct hnae3_handle *h = priv->ae_handle; int ret;
- if (hns3_nic_resetting(netdev)) - return -EBUSY; - h->kinfo.tx_spare_buf_size = data;
ret = hns3_reset_notify(h, HNAE3_DOWN_CLIENT); @@ -1816,6 +1813,11 @@ static int hns3_set_tunable(struct net_device *netdev, struct hnae3_handle *h = priv->ae_handle; int i, ret = 0;
+ if (hns3_nic_resetting(netdev) || !priv->ring) { + netdev_err(netdev, "failed to set tunable value, dev resetting!"); + return -EBUSY; + } + switch (tuna->id) { case ETHTOOL_TX_COPYBREAK: priv->tx_copybreak = *(u32 *)data;
From: Hao Chen chenhao288@hisilicon.com
mainline inclusion from mainline-net-5.17 commit 4d07c5936c25 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
When the PCI device init fails and has not been reinitialized, priv->ring is NULL, and hns3_set/get_ringparam() will access priv->ring, which causes a call trace.
So, add a NULL pointer check to hns3_set/get_ringparam() to avoid this situation.
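The guard that both ethtool paths now perform before touching priv->ring can be condensed as below; a helper like this is not part of the patch, it only illustrates the repeated check.

static int hns3_dev_ready_sketch(struct net_device *netdev)
{
	struct hns3_nic_priv *priv = netdev_priv(netdev);

	if (hns3_nic_resetting(netdev) || !priv->ring) {
		netdev_err(netdev, "dev resetting or uninitialized\n");
		return -EBUSY;
	}
	return 0;
}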
Fixes: 5668abda0931 ("net: hns3: add support for set_ringparam") Signed-off-by: Hao Chen chenhao288@hisilicon.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index b489c5993ad7..69cee085ddee 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -670,8 +670,8 @@ static void hns3_get_ringparam(struct net_device *netdev, struct hnae3_handle *h = priv->ae_handle; int rx_queue_index = h->kinfo.num_tqps;
- if (hns3_nic_resetting(netdev)) { - netdev_err(netdev, "dev resetting!"); + if (hns3_nic_resetting(netdev) || !priv->ring) { + netdev_err(netdev, "failed to get ringparam value, due to dev resetting or uninited\n"); return; }
@@ -1091,8 +1091,14 @@ static int hns3_check_ringparam(struct net_device *ndev, { #define RX_BUF_LEN_2K 2048 #define RX_BUF_LEN_4K 4096 - if (hns3_nic_resetting(ndev)) + + struct hns3_nic_priv *priv = netdev_priv(ndev); + + if (hns3_nic_resetting(ndev) || !priv->ring) { + netdev_err(ndev, "failed to set ringparam value, due to dev resetting or uninited\n"); return -EBUSY; + } +
if (param->rx_mini_pending || param->rx_jumbo_pending) return -EINVAL;
From: Guangbin Huang huangguangbin2@huawei.com
mainline inclusion from mainline-net-5.17 commit ad0ecaef6a2c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4YXIM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently, function hclge_mdio_read() returns 0 if it is called during reset (the cmd state is set to disable).
If the generic phy driver is used, phy_state_machine() updates the phy speed every second in function genphy_read_status_fixed() when the PHY is set to autoneg off, regardless of whether the link is down or up.
If the phy driver happens to read the BMCR register during reset, the phy speed is updated to 10Mbps because the BMCR register value reads as 0. So the phy may fail to link up if the previous speed was not 10Mbps.
To fix this problem, function hclge_mdio_read() should return -EBUSY when the cmd state is disabled, and so should hclge_mdio_write().
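Condensed from the hunks below: the MDIO ops propagate a real error so phylib retries, instead of treating a read of 0 as valid register contents.

static int hclge_mdio_read_sketch(struct mii_bus *bus, int phyid, int regnum)
{
	struct hclge_dev *hdev = bus->priv;

	if (test_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state))
		return -EBUSY;	/* never pretend the register read back 0 */

	/* ... issue the real HCLGE_OPC_MDIO_CONFIG command here ... */
	return 0;
}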
Fixes: 1c1249380992 ("net: hns3: bugfix for hclge_mdio_write and hclge_mdio_read") Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yue Haibing yuehaibing@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c index 63d2be4349e3..03d63b6a9b2b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c @@ -48,7 +48,7 @@ static int hclge_mdio_write(struct mii_bus *bus, int phyid, int regnum, int ret;
if (test_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state)) - return 0; + return -EBUSY;
hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MDIO_CONFIG, false);
@@ -86,7 +86,7 @@ static int hclge_mdio_read(struct mii_bus *bus, int phyid, int regnum) int ret;
if (test_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state)) - return 0; + return -EBUSY;
hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MDIO_CONFIG, true);
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4X0YD?from=project-issue CVE: NA
--------
Export "memory.events" and "memory.events.local" from cgroupv2 to cgroupv1.
There are some differences between v2 and v1:
1) Events of MEMCG_OOM_GROUP_KILL are not included in cgroupv1, because there is no memory.oom.group member.
2) Events of MEMCG_MAX are reported as "limit_in_bytes" in cgroupv1 instead of memory.max.
3) The oom_kill event is included in memory.oom_control. Make oom_kill include its descendants' events, and add oom_kill_local, which includes only its own oom_kill events.
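As a usage illustration only (the cgroup path is hypothetical), the new v1 file can be read like any other cgroup control file and prints the keys listed above:

#include <stdio.h>

int main(void)
{
	/* hypothetical path; the group must be a non-root v1 memcg */
	FILE *f = fopen("/sys/fs/cgroup/memory/mygroup/memory.events", "r");
	char line[128];

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* prints: low, high, limit_in_bytes, oom */
	fclose(f);
	return 0;
}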
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/memcontrol.h | 2 -- mm/memcontrol.c | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0e55013c570d..23f4a3c8fef1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1214,8 +1214,6 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, else cgroup_file_notify(&memcg->events_file);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - break; if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && diff --git a/mm/memcontrol.c b/mm/memcontrol.c index daed900a666e..c1ff95b7a82d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4564,6 +4564,9 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); seq_printf(sf, "oom_kill %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); + seq_printf(sf, "oom_kill_local %lu\n", + atomic_long_read(&memcg->memory_events_local[MEMCG_OOM_KILL])); + return 0; }
@@ -5124,6 +5127,31 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, return nbytes; }
+static void __memcg_events_show(struct seq_file *m, atomic_long_t *events) +{ + seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); + seq_printf(m, "limit_in_bytes %lu\n", + atomic_long_read(&events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); +} + +static int memcg_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + __memcg_events_show(m, memcg->memory_events); + return 0; +} + +static int memcg_events_local_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + __memcg_events_show(m, memcg->memory_events_local); + return 0; +} + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5305,6 +5333,18 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memory_high_show, .write = memory_high_write, }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_file), + .seq_show = memcg_events_show, + }, + { + .name = "events.local", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_local_file), + .seq_show = memcg_events_local_show, + }, { }, /* terminate */ };
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4X0YD?from=project-issue CVE: NA
--------
Since memory.events is now fully supported in cgroupv1, the problem of inconsistent oom event behavior for OOM_MEMCG_KILL occurs again. We fix the problem by adding a new condition under which event propagation continues. Therefore, there are two conditions: 1) the memcg is not the root memcg; 2) the memcg is the root memcg and the event is OOM_MEMCG_KILL of cgroupv1.
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/memcontrol.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 23f4a3c8fef1..9d3a952024a6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1197,6 +1197,18 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_unlock(); }
+static bool memcg_event_add(struct mem_cgroup *memcg, + enum memcg_memory_event event) +{ + if (!mem_cgroup_is_root(memcg)) + return true; + + if (event == MEMCG_OOM_KILL && !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return true; + + return false; +} + static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { @@ -1217,7 +1229,7 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); + memcg_event_add(memcg, event)); }
static inline void memcg_memory_event_mm(struct mm_struct *mm,
From: Christian Brauner christian.brauner@ubuntu.com
mainline inclusion from mianline-v5.14-rc1 commit 661ee6280931548f7b3b887ad26a157474ae5ac4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4X0YD?from=project-issue CVE: NA
-----------------------------------
Introduce the cgroup.kill file. It does what it says on the tin and allows a caller to kill a cgroup by writing "1" into cgroup.kill. The file is available in non-root cgroups.
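For illustration (the cgroup path below is hypothetical), killing every process in a delegated cgroup then becomes a single write from userspace:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical path to a delegated, non-root cgroup */
	int fd = open("/sys/fs/cgroup/mycontainer/cgroup.kill", O_WRONLY);

	if (fd < 0)
		return 1;
	/* only "1" is accepted; other values are rejected with ERANGE */
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}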
Killing cgroups is a process directed operation, i.e. the whole thread-group is affected. Consequently trying to write to cgroup.kill in threaded cgroups will be rejected and EOPNOTSUPP returned. This behavior aligns with cgroup.procs where reads in threaded-cgroups are rejected with EOPNOTSUPP.
The cgroup.kill file is write-only since killing a cgroup is an event, not a state, which makes it different from e.g. freezer where a cgroup transitions between the two states.
As with all new cgroup features cgroup.kill is recursive by default.
Killing a cgroup is protected against concurrent migrations through the cgroup mutex. To protect against forkbombs and to mitigate the effect of racing forks a new CGRP_KILL css set lock protected flag is introduced that is set prior to killing a cgroup and unset after the cgroup has been killed. We can then check in cgroup_post_fork() where we hold the css set lock already whether the cgroup is currently being killed. If so we send the child a SIGKILL signal immediately taking it down as soon as it returns to userspace. To make the killing of the child semantically clean it is killed after all cgroup attachment operations have been finalized.
There are various use-cases of this interface: - Containers usually have a conservative layout where each container usually has a delegated cgroup. For such layouts there is a 1:1 mapping between container and cgroup. If the container in addition uses a separate pid namespace then killing a container usually becomes a simple kill -9 <container-init-pid> from an ancestor pid namespace. However, there are quite a few scenarios where that isn't true. For example, there are containers that share the cgroup with other processes on purpose that are supposed to be bound to the lifetime of the container but are not in the same pidns of the container. Containers that are in a delegated cgroup but share the pid namespace with the host or other containers. - Service managers such as systemd use cgroups to group and organize processes belonging to a service. They usually rely on a recursive algorithm now to kill a service. With cgroup.kill this becomes a simple write to cgroup.kill. - Userspace OOM implementations can make good use of this feature to efficiently take down whole cgroups quickly. - The kill program can gain a new kill --cgroup /sys/fs/cgroup/delegated flag to take down cgroups.
A few observations about the semantics: - If parent and child are in the same cgroup and CLONE_INTO_CGROUP is not specified we are not taking cgroup mutex meaning the cgroup can be killed while a process in that cgroup is forking. If the kill request happens right before cgroup_can_fork() and before the parent grabs its siglock the parent is guaranteed to see the pending SIGKILL. In addition we perform another check in cgroup_post_fork() whether the cgroup is being killed and if so take down the child (see above). This is robust enough and protects against forkbombs. If userspace really really wants to have stricter protection the simple solution would be to grab the write side of the cgroup threadgroup rwsem which will force all ongoing forks to complete before killing starts. We concluded that this is not necessary as the semantics for concurrent forking should simply align with freezer where a similar check as in cgroup_post_fork() is performed.
For all other cases CLONE_INTO_CGROUP is required. In this case we will grab the cgroup mutex so the cgroup can't be killed while we fork. Once we're done with the fork and have dropped cgroup mutex we are visible and will be found by any subsequent kill request. - We obviously don't kill kthreads. This means a cgroup that has a kthread will not become empty after killing and consequently no unpopulated event will be generated. The assumption is that kthreads should be in the root cgroup only anyway so this is not an issue. - We skip killing tasks that already have pending fatal signals. - Freezer doesn't care about tasks in different pid namespaces, i.e. if you have two tasks in different pid namespaces the cgroup would still be frozen. The cgroup.kill mechanism consequently behaves the same way, i.e. we kill all processes and ignore in which pid namespace they exist. - If the caller is located in a cgroup that is killed the caller will obviously be killed as well.
Link: https://lore.kernel.org/r/20210503143922.3093755-1-brauner@kernel.org Cc: Shakeel Butt shakeelb@google.com Cc: Roman Gushchin guro@fb.com Cc: Tejun Heo tj@kernel.org Cc: cgroups@vger.kernel.org Reviewed-by: Shakeel Butt shakeelb@google.com Reviewed-by: Serge Hallyn serge@hallyn.com Acked-by: Roman Gushchin guro@fb.com Signed-off-by: Christian Brauner christian.brauner@ubuntu.com Signed-off-by: Tejun Heo tj@kernel.org Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/cgroup-defs.h | 3 + kernel/cgroup/cgroup.c | 127 ++++++++++++++++++++++++++++++++---- 2 files changed, 116 insertions(+), 14 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 648c2e1ec442..55b9a3924cd7 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -72,6 +72,9 @@ enum {
/* Cgroup is frozen. */ CGRP_FROZEN, + + /* Control group has to be killed. */ + CGRP_KILL, };
/* cgroup_root->flags */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 59cc82ef52a6..7dd4e18405b5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3767,6 +3767,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, return nbytes; }
+static void __cgroup_kill(struct cgroup *cgrp) +{ + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&cgroup_mutex); + + spin_lock_irq(&css_set_lock); + set_bit(CGRP_KILL, &cgrp->flags); + spin_unlock_irq(&css_set_lock); + + css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); + while ((task = css_task_iter_next(&it))) { + /* Ignore kernel threads here. */ + if (task->flags & PF_KTHREAD) + continue; + + /* Skip tasks that are already dying. */ + if (__fatal_signal_pending(task)) + continue; + + send_sig(SIGKILL, task, 0); + } + css_task_iter_end(&it); + + spin_lock_irq(&css_set_lock); + clear_bit(CGRP_KILL, &cgrp->flags); + spin_unlock_irq(&css_set_lock); +} + +static void cgroup_kill(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *css; + struct cgroup *dsct; + + lockdep_assert_held(&cgroup_mutex); + + cgroup_for_each_live_descendant_pre(dsct, css, cgrp) + __cgroup_kill(dsct); +} + +static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + ssize_t ret = 0; + int kill; + struct cgroup *cgrp; + + ret = kstrtoint(strstrip(buf), 0, &kill); + if (ret) + return ret; + + if (kill != 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + /* + * Killing is a process directed operation, i.e. the whole thread-group + * is taken down so act like we do for cgroup.procs and only make this + * writable in non-threaded cgroups. + */ + if (cgroup_is_threaded(cgrp)) + ret = -EOPNOTSUPP; + else + cgroup_kill(cgrp); + + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); @@ -4988,6 +5062,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_freeze_show, .write = cgroup_freeze_write, }, + { + .name = "cgroup.kill", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_kill_write, + }, { .name = "cpu.stat", .seq_show = cpu_stat_show, @@ -6227,6 +6306,8 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned long cgrp_flags = 0; + bool kill = false; struct cgroup_subsys *ss; struct css_set *cset; int i; @@ -6238,6 +6319,11 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */ if (likely(child->pid)) { + if (kargs->cgrp) + cgrp_flags = kargs->cgrp->flags; + else + cgrp_flags = cset->dfl_cgrp->flags; + WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); @@ -6246,23 +6332,32 @@ void cgroup_post_fork(struct task_struct *child, cset = NULL; }
- /* - * If the cgroup has to be frozen, the new task has too. Let's set - * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the - * frozen state. - */ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); + if (!(child->flags & PF_KTHREAD)) { + if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) { + /* + * If the cgroup has to be frozen, the new task has + * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to + * get the task into the frozen state. + */ + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later from + * do_freezer_trap(). So we avoid cgroup's transient + * switch from the frozen state and back. + */ + }
/* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later from - * do_freezer_trap(). So we avoid cgroup's transient switch - * from the frozen state and back. + * If the cgroup is to be killed notice it now and take the + * child down right after we finished preparing it for + * userspace. */ + kill = test_bit(CGRP_KILL, &cgrp_flags); }
spin_unlock_irq(&css_set_lock); @@ -6285,6 +6380,10 @@ void cgroup_post_fork(struct task_struct *child, put_css_set(rcset); }
+ /* Cgroup has to be killed so take down child immediately. */ + if (unlikely(kill)) + do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID); + cgroup_css_set_put_fork(kargs); }
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4X0YD?from=project-issue CVE: NA
--------
Export the cgroup.kill feature from cgroupv2 to cgroupv1. Therefore, a user can kill all processes in one cgroup and its subcgroups instead of killing them one by one.
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/cgroup/cgroup-internal.h | 3 +++ kernel/cgroup/cgroup-v1.c | 5 +++++ kernel/cgroup/cgroup.c | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 6e36e854b512..3f116765bb00 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -266,6 +266,9 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp);
+ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + /* * rstat.c */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 9f5221653f80..ff965ca9ca21 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -658,6 +658,11 @@ struct cftype cgroup1_base_files[] = { .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, + { + .name = "cgroup.kill", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_kill_write, + }, { } /* terminate */ };
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7dd4e18405b5..87cd2bb75307 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3808,8 +3808,8 @@ static void cgroup_kill(struct cgroup *cgrp) __cgroup_kill(dsct); }
-static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) +ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) { ssize_t ret = 0; int kill;
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.18-rc1 commit ae085d7f9365de7da27ab5c0d16b12d51ea7fca9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I50GZX CVE: NA
-----------------------------------
The objcg is not cleared and put for kfence object when it is freed, which could lead to memory leak for struct obj_cgroup and wrong statistics of NR_SLAB_RECLAIMABLE_B or NR_SLAB_UNRECLAIMABLE_B.
Since the last freed object's objcg is not cleared, mem_cgroup_from_obj() could return the wrong memcg when this kfence object, which is not charged to any objcgs, is reallocated to other users.
A real-world issue [1] is caused by this bug.
Link: https://lore.kernel.org/all/000000000000cabcb505dae9e577@google.com/ [1] Reported-by: syzbot+f8c45ccc7d5d45fc5965@syzkaller.appspotmail.com Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB") Signed-off-by: Muchun Song songmuchun@bytedance.com Cc: Dmitry Vyukov dvyukov@google.com Cc: Marco Elver elver@google.com Cc: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Peng Liu liupeng256@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/slab.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/mm/slab.c b/mm/slab.c index d152f910da26..ae84578f3fde 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3428,6 +3428,7 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, { if (is_kfence_address(objp)) { kmemleak_free_recursive(objp, cachep->flags); + memcg_slab_free_hook(cachep, &objp, 1); __kfence_free(objp); return; }
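A userspace analogy of the bug, assuming a toy slot allocator rather than kernel code: when a free path skips clearing a per-slot ownership tag, the next user of that slot inherits the stale tag, just as a kfence object kept its obj_cgroup because __cache_free() returned early before memcg_slab_free_hook() ran.

/* Toy allocator: owner[] plays the role of the per-object objcg tag. */
#include <stdio.h>

#define NSLOTS 4

static const char *owner[NSLOTS];	/* per-slot "objcg" tag */
static int used[NSLOTS];

static int slot_alloc(const char *who)
{
	for (int i = 0; i < NSLOTS; i++) {
		if (!used[i]) {
			used[i] = 1;
			if (who)
				owner[i] = who;	/* charge the slot to an owner */
			return i;
		}
	}
	return -1;
}

static void slot_free_buggy(int i)
{
	used[i] = 0;	/* bug: owner[i] is not cleared on free */
}

int main(void)
{
	int i = slot_alloc("cgroup-A");

	slot_free_buggy(i);
	i = slot_alloc(NULL);	/* new user, charged to nobody */
	printf("stale owner seen by new user: %s\n", owner[i]);	/* cgroup-A */
	return 0;
}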
From: Arnd Bergmann arnd@arndb.de
mainline inclusion from mainline-v5.16-rc1 commit c2e6df3eaaf120cde5e7c3a70590dd82e427458a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I50HG3 CVE: NA
--------------------------------
pgd_page_vaddr() returns an 'unsigned long' address, causing a warning with the memcpy() call in kasan_init():
arch/arm/mm/kasan_init.c: In function 'kasan_init': include/asm-generic/pgtable-nop4d.h:44:50: error: passing argument 2 of '__memcpy' makes pointer from integer without a cast [-Werror=int-conversion] 44 | #define pgd_page_vaddr(pgd) ((unsigned long)(p4d_pgtable((p4d_t){ pgd }))) | ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | | long unsigned int arch/arm/include/asm/string.h:58:45: note: in definition of macro 'memcpy' 58 | #define memcpy(dst, src, len) __memcpy(dst, src, len) | ^~~ arch/arm/mm/kasan_init.c:229:16: note: in expansion of macro 'pgd_page_vaddr' 229 | pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_START)), | ^~~~~~~~~~~~~~ arch/arm/include/asm/string.h:21:47: note: expected 'const void *' but argument is of type 'long unsigned int' 21 | extern void *__memcpy(void *dest, const void *src, __kernel_size_t n); | ~~~~~~~~~~~~^~~
Avoid this by adding an explicit typecast.
Link: https://lore.kernel.org/all/CACRpkdb3DMvof3-xdtss0Pc6KM36pJA-iy=WhvtNVnsDpeJ...
Fixes: 5615f69bc209 ("ARM: 9016/2: Initialize the mapping of KASan shadow memory") Reviewed-by: Linus Walleij linus.walleij@linaro.org Signed-off-by: Arnd Bergmann arnd@arndb.de Signed-off-by: Russell King (Oracle) rmk+kernel@armlinux.org.uk Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm/mm/kasan_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm/mm/kasan_init.c b/arch/arm/mm/kasan_init.c index 9c348042a724..4b1619584b23 100644 --- a/arch/arm/mm/kasan_init.c +++ b/arch/arm/mm/kasan_init.c @@ -226,7 +226,7 @@ void __init kasan_init(void) BUILD_BUG_ON(pgd_index(KASAN_SHADOW_START) != pgd_index(KASAN_SHADOW_END)); memcpy(tmp_pmd_table, - pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_START)), + (void*)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_START)), sizeof(tmp_pmd_table)); set_pgd(&tmp_pgd_table[pgd_index(KASAN_SHADOW_START)], __pgd(__pa(tmp_pmd_table) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
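A standalone sketch of the same warning class, not kernel code: passing an integer-typed address where a pointer parameter is expected triggers -Wint-conversion, and an explicit cast to (void *) resolves it, which is all the patch does for the pgd_page_vaddr() result.

#include <string.h>

static char dst[16];
static char src[16] = "shadow";

int main(void)
{
	unsigned long addr = (unsigned long)src;	/* integer-typed address, like pgd_page_vaddr() */

	/* memcpy(dst, addr, sizeof(dst));  -- warns: makes pointer from integer without a cast */
	memcpy(dst, (void *)addr, sizeof(dst));		/* explicit cast, as in the fix */
	return 0;
}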
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I50PPU CVE: NA
-----------------------------------------------------------------
When a cfs_rq is being unthrottled in distribute_cfs_runtime(), another CPU may re-throttle it in qos_throttle_cfs_rq() before distribute_cfs_runtime() accesses cfs_rq->throttled_list.next. The qos throttle path attaches the cfs_rq's throttled_list node to the per-cpu qos_throttled_cfs_rq list, which changes cfs_rq->throttled_list.next and can cause a panic or hard lockup in distribute_cfs_runtime().
Fix it by adding a qos_throttled_list node to struct cfs_rq, so that the qos throttle path no longer reuses cfs_rq->throttled_list.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com Reviewed-by: zheng zucheng zhengzucheng@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 10 +++++++--- kernel/sched/sched.h | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 583b5dcbf61f..26ea78ae3cca 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5385,6 +5385,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); +#ifdef CONFIG_QOS_SCHED + INIT_LIST_HEAD(&cfs_rq->qos_throttled_list); +#endif }
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -7204,7 +7207,8 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq);
- list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); + list_add(&cfs_rq->qos_throttled_list, + &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); }
static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) @@ -7223,7 +7227,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq);
cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; - list_del_init(&cfs_rq->throttled_list); + list_del_init(&cfs_rq->qos_throttled_list);
/* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -7266,7 +7270,7 @@ static int __unthrottle_qos_cfs_rqs(int cpu) int res = 0;
list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), - throttled_list) { + qos_throttled_list) { if (cfs_rq_throttled(cfs_rq)) { unthrottle_qos_cfs_rq(cfs_rq); res++; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d122f1b8e3e6..fadd38187c2a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -626,8 +626,12 @@ struct cfs_rq { #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */
+#if !defined(__GENKSYMS__) && defined(CONFIG_QOS_SCHED) + struct list_head qos_throttled_list; +#else KABI_RESERVE(1) KABI_RESERVE(2) +#endif KABI_RESERVE(3) KABI_RESERVE(4) };
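A userspace sketch of the underlying list corruption, assuming a simplified doubly-linked list rather than the kernel's <linux/list.h>: one embedded list node cannot sit on two lists at once, because re-adding it rewrites its next/prev pointers, so a walker that already holds a pointer from the first list follows links into the second one. The patch avoids this by giving qos throttling its own qos_throttled_list node.

#include <stdio.h>

struct node { struct node *next, *prev; };

static void list_init(struct node *h) { h->next = h->prev = h; }

static void list_add(struct node *n, struct node *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

int main(void)
{
	struct node list_a, list_b, cfs_rq_node;

	list_init(&list_a);
	list_init(&list_b);

	list_add(&cfs_rq_node, &list_a);	/* throttled onto list A */
	list_add(&cfs_rq_node, &list_b);	/* re-added to list B without removal */

	/* A walker on list A now follows the node's rewritten links into list B. */
	printf("node->next points into list B: %d\n", cfs_rq_node.next == &list_b);
	return 0;
}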