Binder Makin (1): cgroups: Resource controller for open files
Hou Tao (1): cgroup/files: use task_get_css() to get a valid css during dup_fd()
Lu Jialin (2): fs: fix files.usage bug when move tasks fs/filescontrol.c: fix warning:large integer implicitly truncated to unsigned type
Wenkai Lin (1): iommu/arm-smmu-v3: disable stall for quiet_cd
Yang Yingliang (1): cgroup/files: support boot parameter to control if disable files cgroup
Yu Kuai (1): fs/filescontrol: add a switch to enable / disable accounting of open fds
Zhang Xiaoxu (2): files_cgroup: fix error pointer when kvm_vm_worker_thread files_cgroup: Fix soft lockup when refcnt overflow.
zhangyi (F) (1): filescontrol: silence suspicious RCU warning
From: Wenkai Lin linwenkai6@hisilicon.com
maillist inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8PSC6
Reference: https://patchwork.kernel.org/project/kvm/patch/20231206005727.46150-1-zhangf...
----------------------------------------------
In the stall model, invalid transactions were expected to be stalled and aborted by the IOPF handler.
However, when killing a test case with a huge amount of data, the accelerator streamline can not stop until all data is consumed even if the page fault handler reports errors. As a result, the kill may take a long time, about 10 seconds with numerous iopf interrupts.
So disable stall for quiet_cd in the non-force stall model, since force stall model (STALL_MODEL==0b10) requires CD.S must be 1.
Signed-off-by: Zhangfei Gao zhangfei.gao@linaro.org Signed-off-by: Wenkai Lin linwenkai6@hisilicon.com Suggested-by: Jean-Philippe Brucker jean-philippe@linaro.org Reviewed-by: Jason Gunthorpe jgg@nvidia.com Reviewed-by: Jean-Philippe Brucker jean-philippe@linaro.org Link: https://lore.kernel.org/r/20231206005727.46150-1-zhangfei.gao@linaro.org Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Zhang Zekun zhangzekun11@huawei.com Signed-off-by: chenridong chenridong@huawei.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 4d417b4243eb..5055a66644af 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1070,6 +1070,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid, u64 val; bool cd_live; __le64 *cdptr; + struct arm_smmu_device *smmu = smmu_domain->smmu;
if (WARN_ON(ssid >= (1 << smmu_domain->s1_cfg.s1cdmax))) return -E2BIG; @@ -1084,6 +1085,8 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid, if (!cd) { /* (5) */ val = 0; } else if (cd == &quiet_cd) { /* (4) */ + if (!(smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) + val &= ~(CTXDESC_CD_0_S | CTXDESC_CD_0_R); val |= CTXDESC_CD_0_TCR_EPD0; } else if (cd_live) { /* (3) */ val &= ~CTXDESC_CD_0_ASID;
From: Binder Makin merimus@google.com
maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
Add a lockless resource controller for limiting the number of open file handles. This allows us to catch misbehaving processes and return EMFILE instead of ENOMEM for kernel memory limits.
Original link: https://lwn.net/Articles/604129/. After commit https://gitlab.indel.ch/thirdparty/linux-indel/commit/5b1efc027c0b51ca3e76f4e00c83358f8349f543 was introduced, all memory accounting and limiting was switched over to the lockless page counters, so we convert the original resource counters to lockless page counters.
Signed-off-by: Binder Makin merimus@google.com Signed-off-by: chenridong chenridong@huawei.com --- fs/Makefile | 1 + fs/file.c | 53 +++++- fs/filescontrol.c | 321 ++++++++++++++++++++++++++++++++++ include/linux/cgroup-defs.h | 8 +- include/linux/cgroup.h | 6 + include/linux/cgroup_subsys.h | 4 + include/linux/fdtable.h | 1 + include/linux/filescontrol.h | 34 ++++ init/Kconfig | 10 ++ 9 files changed, 433 insertions(+), 5 deletions(-) create mode 100644 fs/filescontrol.c create mode 100644 include/linux/filescontrol.h
diff --git a/fs/Makefile b/fs/Makefile index f9541f40be4e..bb6164200a39 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o sysctls.o
obj-$(CONFIG_FHANDLE) += fhandle.o +obj-$(CONFIG_CGROUP_FILES) += filescontrol.o obj-y += iomap/
obj-y += quota/ diff --git a/fs/file.c b/fs/file.c index 3e4a4dfa38fc..c6565d998f8c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> +#include <linux/filescontrol.h> #include <linux/close_range.h> #include <net/sock.h>
@@ -337,6 +338,9 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; +#ifdef CONFIG_CGROUP_FILES + files_cgroup_assign(newf); +#endif
spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); @@ -400,10 +404,29 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
rcu_assign_pointer(newf->fdt, new_fdt); +#ifdef CONFIG_CGROUP_FILES + if (!files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf))) + return newf; + +/* could not get enough FD resources. Need to clean up. */ + new_fds = new_fdt->fd; + for (i = open_files; i != 0; i--) { + struct file *f = *new_fds++;
+ if (f) + fput(f); + } + if (new_fdt != &newf->fdtab) + __free_fdtable(new_fdt); + *errorp = -EMFILE; +#else return newf; +#endif
out_release: +#ifdef CONFIG_CGROUP_FILES + files_cgroup_remove(newf); +#endif kmem_cache_free(files_cachep, newf); out: return NULL; @@ -429,6 +452,9 @@ static struct fdtable *close_files(struct files_struct * files) if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); if (file) { +#ifdef CONFIG_CGROUP_FILES + files_cgroup_unalloc_fd(files, 1); +#endif filp_close(file, files); cond_resched(); } @@ -531,6 +557,12 @@ static int alloc_fd(unsigned start, unsigned end, unsigned flags) */ if (error) goto repeat; +#ifdef CONFIG_CGROUP_FILES + if (files_cgroup_alloc_fd(files, 1)) { + error = -EMFILE; + goto out; + } +#endif
if (start <= files->next_fd) files->next_fd = fd + 1; @@ -568,6 +600,10 @@ EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); +#ifdef CONFIG_CGROUP_FILES + if (test_bit(fd, fdt->open_fds)) + files_cgroup_unalloc_fd(files, 1); +#endif __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; @@ -1106,6 +1142,7 @@ static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { + int err; struct file *tofree; struct fdtable *fdt;
@@ -1125,8 +1162,16 @@ __releases(&files->file_lock) */ fdt = files_fdtable(files); tofree = fdt->fd[fd]; - if (!tofree && fd_is_open(fd, fdt)) - goto Ebusy; + if (!tofree && fd_is_open(fd, fdt)) { + err = -EBUSY; + goto out; + } +#ifdef CONFIG_CGROUP_FILES + if (!tofree && files_cgroup_alloc_fd(files, 1)) { + err = -EMFILE; + goto out; + } +#endif get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt); @@ -1141,9 +1186,9 @@ __releases(&files->file_lock)
return fd;
-Ebusy: +out: spin_unlock(&files->file_lock); - return -EBUSY; + return err; }
int replace_fd(unsigned fd, struct file *file, unsigned flags) diff --git a/fs/filescontrol.c b/fs/filescontrol.c new file mode 100644 index 000000000000..44ad9ef44e20 --- /dev/null +++ b/fs/filescontrol.c @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: GPL-2.0 +/* filescontrol.c - Cgroup controller for open file handles. + * + * Copyright 2014 Google Inc. + * Author: Brian Makin merimus@google.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/page_counter.h> +#include <linux/filescontrol.h> +#include <linux/cgroup.h> +#include <linux/export.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/fdtable.h> +#include <linux/sched/signal.h> + +#define FILES_MAX ULLONG_MAX +#define FILES_MAX_STR "max" + + +struct cgroup_subsys files_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(files_cgrp_subsys); + +struct files_cgroup { + struct cgroup_subsys_state css; + struct page_counter open_handles; +}; + +static inline struct files_cgroup *css_fcg(struct cgroup_subsys_state *css) +{ + return css ? 
container_of(css, struct files_cgroup, css) : NULL; +} + +static inline struct page_counter * +css_res_open_handles(struct cgroup_subsys_state *css) +{ + return &css_fcg(css)->open_handles; +} + +static inline struct files_cgroup * +files_cgroup_from_files(struct files_struct *files) +{ + return files->files_cgroup; +} + + +static struct cgroup_subsys_state * +files_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct files_cgroup *parent_fcg; + struct files_cgroup *fcg; + + parent_fcg = css_fcg(parent_css); + fcg = kzalloc(sizeof(*fcg), GFP_KERNEL); + if (!fcg) + goto out; + + if (!parent_fcg) { + page_counter_init(&fcg->open_handles, NULL); + page_counter_set_max(&fcg->open_handles, FILES_MAX); + } else { + struct page_counter *p_counter = &parent_fcg->open_handles; + + page_counter_init(&fcg->open_handles, p_counter); + page_counter_set_max(&fcg->open_handles, FILES_MAX); + } + return &fcg->css; + +out: + return ERR_PTR(-ENOMEM); +} + +static void files_cgroup_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_fcg(css)); +} + +u64 files_cgroup_count_fds(struct files_struct *files) +{ + int i; + struct fdtable *fdt; + int retval = 0; + + fdt = files_fdtable(files); + for (i = 0; i < DIV_ROUND_UP(fdt->max_fds, BITS_PER_LONG); i++) + retval += hweight64((__u64)fdt->open_fds[i]); + return retval; +} + +static u64 files_in_taskset(struct cgroup_taskset *tset) +{ + struct task_struct *task; + u64 files = 0; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(task, css, tset) { + if (!thread_group_leader(task)) + continue; + + task_lock(task); + files += files_cgroup_count_fds(task->files); + task_unlock(task); + } + return files; +} + +/* + * If attaching this cgroup would overcommit the resource then deny + * the attach. 
+ */ +static int files_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + unsigned long margin; + struct page_counter *cnt; + unsigned long counter; + u64 files = files_in_taskset(tset); + + cgroup_taskset_first(tset, &css); + cnt = css_res_open_handles(css); + + counter = (unsigned long)atomic_long_read(&cnt->usage); + if (cnt->max > counter) + margin = cnt->max - counter; + else + margin = 0; + if (margin < files) + return -ENOMEM; + return 0; +} + +/* + * If resource counts have gone up between can_attach and attach then + * this may overcommit resources. In that case just deny further allocation + * until the resource usage drops. + */ +static void files_cgroup_attach(struct cgroup_taskset *tset) +{ + u64 num_files; + struct cgroup_subsys_state *to_css; + struct cgroup_subsys_state *from_css; + struct page_counter *from_res; + struct page_counter *to_res; + struct page_counter *fail_res; + struct files_struct *files; + struct task_struct *task = cgroup_taskset_first(tset, &to_css); + + to_res = css_res_open_handles(to_css); + + task_lock(task); + files = task->files; + if (!files) { + task_unlock(task); + return; + } + + from_css = &files_cgroup_from_files(files)->css; + from_res = css_res_open_handles(from_css); + + spin_lock(&files->file_lock); + num_files = files_cgroup_count_fds(files); + page_counter_uncharge(from_res, num_files); + css_put(from_css); + + if (!page_counter_try_charge(to_res, num_files, &fail_res)) + pr_err("Open files limit overcommited\n"); + css_get(to_css); + task->files->files_cgroup = css_fcg(to_css); + spin_unlock(&files->file_lock); + task_unlock(task); +} + +int files_cgroup_alloc_fd(struct files_struct *files, u64 n) +{ + /* + * Kernel threads which are forked by kthreadd inherited the + * const files_struct 'init_files', we didn't wrap it so + * there's no associated files_cgroup. 
+ * + * Kernel threads always stay in root cgroup, and we don't + * have limit for root files cgroup, so it won't hurt if + * we don't charge their fds, only issue is that files.usage + * won't be accurate in root files cgroup. + */ + if (files != &init_files) { + struct page_counter *fail_res; + struct files_cgroup *files_cgroup = + files_cgroup_from_files(files); + if (!page_counter_try_charge(&files_cgroup->open_handles, + n, &fail_res)) + return -ENOMEM; + } + return 0; +} +EXPORT_SYMBOL(files_cgroup_alloc_fd); + +void files_cgroup_unalloc_fd(struct files_struct *files, u64 n) +{ + /* + * It's not charged so no need to uncharge, see comments in + * files_cgroup_alloc_fd. + */ + if (files != &init_files) { + struct files_cgroup *files_cgroup = + files_cgroup_from_files(files); + page_counter_uncharge(&files_cgroup->open_handles, n); + } +} +EXPORT_SYMBOL(files_cgroup_unalloc_fd); + + +static int files_limit_read(struct seq_file *sf, void *v) +{ + struct files_cgroup *fcg = css_fcg(seq_css(sf)); + struct page_counter *counter = &fcg->open_handles; + u64 limit = counter->max; + + if (limit >= FILES_MAX) + seq_printf(sf, "%s\n", FILES_MAX_STR); + else + seq_printf(sf, "%llu\n", limit); + + return 0; +} + +static ssize_t files_limit_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct files_cgroup *fcg = css_fcg(of_css(of)); + u64 limit; + int err; + + buf = strstrip((char *)buf); + if (!strcmp(buf, FILES_MAX_STR)) { + limit = FILES_MAX; + goto set_limit; + } + + err = kstrtoull(buf, 0, &limit); + if (err) + return err; + +set_limit: + /* + * Limit updates don't need to be mutex'd, since it isn't + * critical that any racing fork()s follow the new limit. 
+ */ + page_counter_set_max(&fcg->open_handles, limit); + return nbytes; +} + + +static u64 files_usage_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct files_cgroup *fcg = css_fcg(css); + + return page_counter_read(&fcg->open_handles); +} + +static struct cftype files[] = { + { + .name = "limit", + .seq_show = files_limit_read, + .write = files_limit_write, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "usage", + .read_u64 = files_usage_read, + }, + { } +}; + +struct cgroup_subsys files_cgrp_subsys = { + .css_alloc = files_cgroup_css_alloc, + .css_free = files_cgroup_css_free, + .can_attach = files_cgroup_can_attach, + .attach = files_cgroup_attach, + .legacy_cftypes = files, + .dfl_cftypes = files, +}; + +void files_cgroup_assign(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + + task_lock(tsk); + cgrp = task_cgroup(tsk, files_cgrp_id); + css = cgroup_subsys_state(cgrp, files_cgrp_id); + css_get(css); + files->files_cgroup = container_of(css, struct files_cgroup, css); + task_unlock(tsk); +} + +void files_cgroup_remove(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct files_cgroup *fcg; + + task_lock(tsk); + spin_lock(&files->file_lock); + fcg = files_cgroup_from_files(files); + css_put(&fcg->css); + spin_unlock(&files->file_lock); + task_unlock(tsk); +} diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 265da00a1a8b..ad632b340c7b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -653,6 +653,12 @@ struct cftype { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off);
+ int (*read_seq_string)(struct cgroup *cont, struct cftype *cft, + struct seq_file *m); + + int (*write_string)(struct cgroup *cgrp, struct cftype *cft, + const char *buffer); + __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt);
@@ -742,7 +748,7 @@ struct cgroup_subsys { */ struct cftype *dfl_cftypes; /* for the default hierarchy */ struct cftype *legacy_cftypes; /* for the legacy hierarchies */ - + struct cftype *base_cftypes; /* * A subsystem may depend on other subsystems. When such subsystem * is enabled on a cgroup, the depended-upon subsystems are enabled diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7fa51b600ee8..1cb90ea3299a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -362,6 +362,12 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); }
+static inline struct cgroup_subsys_state *cgroup_subsys_state( + struct cgroup *cgrp, int subsys_id) +{ + return cgrp->subsys[subsys_id]; +} + extern struct mutex cgroup_mutex;
static inline void cgroup_lock(void) diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 445235487230..85fa78049bd0 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -72,6 +72,10 @@ SUBSYS(misc) SUBSYS(debug) #endif
+#if IS_ENABLED(CONFIG_CGROUP_FILES) +SUBSYS(files) +#endif + /* * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. */ diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index e066816f3519..22b8b03fef6d 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -65,6 +65,7 @@ struct files_struct { unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; + struct files_cgroup *files_cgroup; };
struct file_operations; diff --git a/include/linux/filescontrol.h b/include/linux/filescontrol.h new file mode 100644 index 000000000000..49dc620cf64e --- /dev/null +++ b/include/linux/filescontrol.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* filescontrol.h - Files Controller + * + * Copyright 2014 Google Inc. + * Author: Brian Makin merimus@google.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_FILESCONTROL_H +#define _LINUX_FILESCONTROL_H + +#include <linux/fdtable.h> + +#ifdef CONFIG_CGROUP_FILES + +extern int files_cgroup_alloc_fd(struct files_struct *files, u64 n); +extern void files_cgroup_unalloc_fd(struct files_struct *files, u64 n); +extern u64 files_cgroup_count_fds(struct files_struct *files); +extern struct files_struct init_files; + +void files_cgroup_assign(struct files_struct *files); +void files_cgroup_remove(struct files_struct *files); + +#endif /* CONFIG_CGROUP_FILES */ +#endif /* _LINUX_FILESCONTROL_H */ diff --git a/init/Kconfig b/init/Kconfig index 2ee1384c4f81..418f94fb43b1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1197,6 +1197,16 @@ config CGROUP_V1_KILL default n depends on CGROUPS
+config CGROUP_FILES + bool "Files Resource Controller for Control Groups" + select PAGE_COUNTER + default n + help + Provides a cgroup resource controller that limits number of open + file handles within a cgroup. + This supports catching misbehaving processes and + return EMFILE instead of ENOMEM for kernel memory limits. + endif # CGROUPS
menuconfig NAMESPACES
From: Hou Tao houtao1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
Process fork and cgroup migration can happen simultaneously, and in the following case use-after-free of css_set is possible:
CPU 0: process fork CPU 1: cgroup migration
dup_fd __cgroup1_procs_write(threadgroup=false) files_cgroup_assign // task A task_lock task_cgroup(current, files_cgrp_id) css_set = task_css_set_check()
cgroup_migrate_execute files_cgroup_can_attach css_set_move_task put_css_set_locked() files_cgroup_attach // task B which is in the same // thread group as task A task_lock cgroup_migrate_finish // the css_set will be freed put_css_set_locked()
// use-after-free css_set->subsys[files_cgrp_id]
Fix it by using task_get_css() instead to get a valid css.
Fixes: 52cc1eccf6de ("cgroups: Resource controller for open files") Signed-off-by: Hou Tao houtao1@huawei.com Signed-off-by: chenridong chenridong@huawei.com --- fs/filescontrol.c | 12 +++++------- include/linux/cgroup.h | 6 ------ 2 files changed, 5 insertions(+), 13 deletions(-)
diff --git a/fs/filescontrol.c b/fs/filescontrol.c index 44ad9ef44e20..1d2d29127fd4 100644 --- a/fs/filescontrol.c +++ b/fs/filescontrol.c @@ -293,18 +293,16 @@ struct cgroup_subsys files_cgrp_subsys = { .dfl_cftypes = files, };
+/* + * It could race against cgroup migration of current task, and + * using task_get_css() to get a valid css. + */ void files_cgroup_assign(struct files_struct *files) { - struct task_struct *tsk = current; struct cgroup_subsys_state *css; - struct cgroup *cgrp;
- task_lock(tsk); - cgrp = task_cgroup(tsk, files_cgrp_id); - css = cgroup_subsys_state(cgrp, files_cgrp_id); - css_get(css); + css = task_get_css(current, files_cgrp_id); files->files_cgroup = container_of(css, struct files_cgroup, css); - task_unlock(tsk); }
void files_cgroup_remove(struct files_struct *files) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1cb90ea3299a..7fa51b600ee8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -362,12 +362,6 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); }
-static inline struct cgroup_subsys_state *cgroup_subsys_state( - struct cgroup *cgrp, int subsys_id) -{ - return cgrp->subsys[subsys_id]; -} - extern struct mutex cgroup_mutex;
static inline void cgroup_lock(void)
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
This switch can only change the accounting of open fds in filescontrol from enabled to disabled. If it is already disabled, the switch can't re-enable it.
The counter is enabled by default, and it can be disabled by: a. echo 1 > /sys/fs/cgroup/files/files.no_acct b. add "filescontrol.no_acct=1" to boot cmd
Signed-off-by: Yu Kuai yukuai3@huawei.com Signed-off-by: chenridong chenridong@huawei.com --- fs/filescontrol.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-)
diff --git a/fs/filescontrol.c b/fs/filescontrol.c index 1d2d29127fd4..a24be705f621 100644 --- a/fs/filescontrol.c +++ b/fs/filescontrol.c @@ -25,14 +25,17 @@ #include <linux/seq_file.h> #include <linux/fdtable.h> #include <linux/sched/signal.h> +#include <linux/module.h>
#define FILES_MAX ULLONG_MAX #define FILES_MAX_STR "max"
- +static bool no_acct; struct cgroup_subsys files_cgrp_subsys __read_mostly; EXPORT_SYMBOL(files_cgrp_subsys);
+module_param(no_acct, bool, 0444); + struct files_cgroup { struct cgroup_subsys_state css; struct page_counter open_handles; @@ -194,7 +197,7 @@ int files_cgroup_alloc_fd(struct files_struct *files, u64 n) * we don't charge their fds, only issue is that files.usage * won't be accurate in root files cgroup. */ - if (files != &init_files) { + if (!no_acct && files != &init_files) { struct page_counter *fail_res; struct files_cgroup *files_cgroup = files_cgroup_from_files(files); @@ -212,7 +215,7 @@ void files_cgroup_unalloc_fd(struct files_struct *files, u64 n) * It's not charged so no need to uncharge, see comments in * files_cgroup_alloc_fd. */ - if (files != &init_files) { + if (!no_acct && files != &init_files) { struct files_cgroup *files_cgroup = files_cgroup_from_files(files); page_counter_uncharge(&files_cgroup->open_handles, n); @@ -220,6 +223,21 @@ void files_cgroup_unalloc_fd(struct files_struct *files, u64 n) } EXPORT_SYMBOL(files_cgroup_unalloc_fd);
+static u64 files_disabled_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return no_acct; +} + +static int files_disabled_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (!val) + return -EINVAL; + no_acct = true; + + return 0; +}
static int files_limit_read(struct seq_file *sf, void *v) { @@ -281,6 +299,12 @@ static struct cftype files[] = { .name = "usage", .read_u64 = files_usage_read, }, + { + .name = "no_acct", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = files_disabled_read, + .write_u64 = files_disabled_write, + }, { } };
From: Zhang Xiaoxu zhangxiaoxu5@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
When fixing CVE-2018-12207, kvm_vm_worker_thread was made to attach to all cgroup subsystems. But the files cgroup doesn't support kernel threads.
Because the init_files doesn't init the files cgroup, when kernel thread 'kvm_vm_worker_thread' attach the files cgroup, the files_cgroup get from 'init_files' is an error pointer. It lead the kernel panic as below: [ 724.842302] page_counter_uncharge+0x1d/0x30 [ 724.842431] files_cgroup_attach+0x7c/0x130 [ 724.842564] ? css_set_move_task+0x12e/0x230 [ 724.842694] cgroup_migrate_execute+0x2f9/0x3b0 [ 724.842833] cgroup_attach_task+0x156/0x200 [ 724.843010] ? kvm_mmu_pte_write+0x490/0x490 [kvm] [ 724.843153] cgroup_attach_task_all+0x81/0xd0 [ 724.843289] ? __schedule+0x294/0x910 [ 724.843419] kvm_vm_worker_thread+0x4a/0xc0 [kvm] [ 724.843579] ? kvm_exit+0x80/0x80 [kvm] [ 724.843690] kthread+0x112/0x130 [ 724.843792] ?kthread_create_worker_on_cpu+0x70/0x70 [ 724.843948] ret_from_fork+0x35/0x40
So we add a check: if the task is a kernel thread (its files is 'init_files'), we skip any further files cgroup operations.
Fixes: baa10bc24e1e ("kvm: Add helper function for creating VM ...") Signed-off-by: Zhang Xiaoxu zhangxiaoxu5@huawei.com Signed-off-by: chenridong chenridong@huawei.com --- fs/filescontrol.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/filescontrol.c b/fs/filescontrol.c index a24be705f621..db0beee474b2 100644 --- a/fs/filescontrol.c +++ b/fs/filescontrol.c @@ -164,7 +164,7 @@ static void files_cgroup_attach(struct cgroup_taskset *tset)
task_lock(task); files = task->files; - if (!files) { + if (!files || files == &init_files) { task_unlock(task); return; } @@ -325,6 +325,9 @@ void files_cgroup_assign(struct files_struct *files) { struct cgroup_subsys_state *css;
+ if (files == &init_files) + return; + css = task_get_css(current, files_cgrp_id); files->files_cgroup = container_of(css, struct files_cgroup, css); } @@ -334,6 +337,9 @@ void files_cgroup_remove(struct files_struct *files) struct task_struct *tsk = current; struct files_cgroup *fcg;
+ if (files == &init_files) + return; + task_lock(tsk); spin_lock(&files->file_lock); fcg = files_cgroup_from_files(files);
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
If the parent cgroup's files.limit is 0, moving a task into the child cgroup fails. When the task is then killed, the files.usage of both the parent cgroup and the child cgroup becomes abnormal.
/sys/fs/cgroup/parent # ls cgroup.clone_children files.limit tasks cgroup.procs files.usage child notify_on_release /sys/fs/cgroup/parent # echo 0 >files.limit /sys/fs/cgroup/parent # cd child /sys/fs/cgroup/parent/child # ls cgroup.clone_children files.limit notify_on_release cgroup.procs files.usage tasks /sys/fs/cgroup/parent/child # echo 156 >tasks [ 879.564728] Open files limit overcommited /sys/fs/cgroup/parent/child # kill -9 156 /sys/fs/cgroup/parent/child # [ 886.363690] WARNING: CPU: 0 PID: 156 at mm/page_counter.c:62 page_counter_cancel+0x26/0x30 [ 886.364093] Modules linked in: [ 886.364093] CPU: 0 PID: 156 Comm: top Not tainted 4.18.0+ #1 [ 886.364093] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 [ 886.365350] RIP: 0010:page_counter_cancel+0x26/0x30 [ 886.365350] Code: 0f 1f 40 00 66 66 66 66 90 48 89 f0 53 48 f7 d8 f0 48 0f c1 07 48 29 f0 48 89 c3 48 89 c6 e8 61 ff ff ff 48 85 d5 [ 886.365350] RSP: 0018:ffffb754006b7d00 EFLAGS: 00000286 [ 886.365350] RAX: 0000000000000000 RBX: ffffffffffffffff RCX: 0000000000000001 [ 886.365350] RDX: 0000000000000000 RSI: ffffffffffffffff RDI: ffff9ca61888b930 [ 886.365350] RBP: 0000000000000001 R08: 00000000000295c0 R09: ffffffff820597aa [ 886.365350] R10: ffffffffffffffff R11: ffffd78601823508 R12: 0000000000000000 [ 886.365350] R13: ffff9ca6181c0628 R14: 0000000000000000 R15: ffff9ca663e9d000 [ 886.365350] FS: 0000000000000000(0000) GS:ffff9ca661e00000(0000) knlGS:0000000000000000 [ 886.365350] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 886.365350] CR2: 0000000000867fb8 CR3: 0000000017a0a000 CR4: 00000000000006f0 [ 886.365350] Call Trace: [ 886.369392] page_counter_uncharge+0x1d/0x30 [ 886.369392] put_files_struct+0x7c/0xe0 [ 886.369392] do_exit+0x2c7/0xb90 [ 886.369392] ? __schedule+0x2a1/0x900 [ 886.369392] do_group_exit+0x3a/0xa0 [ 886.369392] get_signal+0x15e/0x870 [ 886.369392] do_signal+0x36/0x610 [ 886.369392] ? 
do_vfs_ioctl+0xa4/0x640 [ 886.369392] ? do_vfs_ioctl+0xa4/0x640 [ 886.369392] ? dput+0x29/0x110 [ 886.369392] exit_to_usermode_loop+0x71/0xe0 [ 886.369392] do_syscall_64+0x181/0x1b0 [ 886.369392] entry_SYSCALL_64_after_hwframe+0x65/0xca [ 886.369392] RIP: 0033:0x4b9b5a [ 886.369392] Code: Bad RIP value. [ 886.369392] RSP: 002b:00007ffe27221968 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 [ 886.373373] RAX: fffffffffffffe00 RBX: 0000000000000001 RCX: 00000000004b9b5a [ 886.373373] RDX: 00007ffe27221930 RSI: 0000000000005402 RDI: 0000000000000000 [ 886.373373] RBP: 0000000000000135 R08: 00007ffe272219a4 R09: 0000000000000010 [ 886.373373] R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000 [ 886.373373] R13: 0000000000000005 R14: 0000000000000135 R15: 0000000000000000 [ 886.373373] ---[ end trace 56c4971a753a98c5 ]---
[1]+ Killed top /sys/fs/cgroup/parent/child # ls cgroup.clone_children files.limit notify_on_release cgroup.procs files.usage tasks /sys/fs/cgroup/parent/child # cat files.usage 18446744073709551613 /sys/fs/cgroup/parent/child # cd .. /sys/fs/cgroup/parent # ls cgroup.clone_children files.limit tasks cgroup.procs files.usage child notify_on_release /sys/fs/cgroup/parent # cat files.usage 18446744073709551613
The reason is that when moving a task into the child cgroup fails, the files.usage of the child cgroup and its parent cgroup stay the same as before, while the struct files_cgroup pointer points to the dst_css. Therefore, when the task is killed, page_counter_uncharge() subtracts from the files.usage of the child cgroup and its parent cgroup again, and the files.usage becomes abnormal.
If we only changed the struct files_cgroup pointer when the charge succeeds in files_cgroup_attach, problems would occur in some extreme scenarios. 1) If we add num_files back to the original page_counter when charging the file resource to the new cgroup fails, the files.usage can become larger than the files.limit of the original cgroup if a new task moves into the original cgroup at the same time. 2) If we subtract num_files from the original page_counter only after successfully charging the file resource to the new cgroup, then when the parent's files.limit equals its files.usage and the parent has two child cgroups, moving a task from one child cgroup into the other will fail.
The patch merges the logic of files_cgroup_attach() into files_cgroup_can_attach() and deletes files_cgroup_attach(). This moves the file-related resources into the new cgroup before the task is moved. When try_charge fails, the task and its file resources stay in the original cgroup. The above problems are thereby solved.
Signed-off-by: Lu Jialin lujialin4@huawei.com Signed-off-by: chenridong chenridong@huawei.com --- fs/filescontrol.c | 63 ++++++++++------------------------------------- 1 file changed, 13 insertions(+), 50 deletions(-)
diff --git a/fs/filescontrol.c b/fs/filescontrol.c index db0beee474b2..41abe29fc0f8 100644 --- a/fs/filescontrol.c +++ b/fs/filescontrol.c @@ -102,56 +102,14 @@ u64 files_cgroup_count_fds(struct files_struct *files) return retval; }
-static u64 files_in_taskset(struct cgroup_taskset *tset) -{ - struct task_struct *task; - u64 files = 0; - struct cgroup_subsys_state *css; - - cgroup_taskset_for_each(task, css, tset) { - if (!thread_group_leader(task)) - continue; - - task_lock(task); - files += files_cgroup_count_fds(task->files); - task_unlock(task); - } - return files; -} - /* * If attaching this cgroup would overcommit the resource then deny - * the attach. + * the attach. If not, attach the file resource into new cgroup. */ static int files_cgroup_can_attach(struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - unsigned long margin; - struct page_counter *cnt; - unsigned long counter; - u64 files = files_in_taskset(tset); - - cgroup_taskset_first(tset, &css); - cnt = css_res_open_handles(css); - - counter = (unsigned long)atomic_long_read(&cnt->usage); - if (cnt->max > counter) - margin = cnt->max - counter; - else - margin = 0; - if (margin < files) - return -ENOMEM; - return 0; -} - -/* - * If resource counts have gone up between can_attach and attach then - * this may overcommit resources. In that case just deny further allocation - * until the resource usage drops. - */ -static void files_cgroup_attach(struct cgroup_taskset *tset) { u64 num_files; + bool can_attach; struct cgroup_subsys_state *to_css; struct cgroup_subsys_state *from_css; struct page_counter *from_res; @@ -166,7 +124,7 @@ static void files_cgroup_attach(struct cgroup_taskset *tset) files = task->files; if (!files || files == &init_files) { task_unlock(task); - return; + return 0; }
from_css = &files_cgroup_from_files(files)->css; @@ -175,14 +133,20 @@ static void files_cgroup_attach(struct cgroup_taskset *tset) spin_lock(&files->file_lock); num_files = files_cgroup_count_fds(files); page_counter_uncharge(from_res, num_files); - css_put(from_css);
- if (!page_counter_try_charge(to_res, num_files, &fail_res)) + if (!page_counter_try_charge(to_res, num_files, &fail_res)) { + page_counter_charge(from_res, num_files); pr_err("Open files limit overcommited\n"); - css_get(to_css); - task->files->files_cgroup = css_fcg(to_css); + can_attach = false; + } else { + css_put(from_css); + css_get(to_css); + task->files->files_cgroup = css_fcg(to_css); + can_attach = true; + } spin_unlock(&files->file_lock); task_unlock(task); + return can_attach ? 0 : -ENOSPC; }
int files_cgroup_alloc_fd(struct files_struct *files, u64 n) @@ -312,7 +276,6 @@ struct cgroup_subsys files_cgrp_subsys = { .css_alloc = files_cgroup_css_alloc, .css_free = files_cgroup_css_free, .can_attach = files_cgroup_can_attach, - .attach = files_cgroup_attach, .legacy_cftypes = files, .dfl_cftypes = files, };
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
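The nr_pages parameter of page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) is unsigned long, so defining FILES_MAX as ULLONG_MAX can trigger a "large integer implicitly truncated to unsigned type" warning. Therefore change FILES_MAX to ULONG_MAX.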
Signed-off-by: Lu Jialin lujialin4@huawei.com Signed-off-by: chenridong chenridong@huawei.com --- fs/filescontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/filescontrol.c b/fs/filescontrol.c index 41abe29fc0f8..4ad500f40025 100644 --- a/fs/filescontrol.c +++ b/fs/filescontrol.c @@ -27,7 +27,7 @@ #include <linux/sched/signal.h> #include <linux/module.h>
-#define FILES_MAX ULLONG_MAX +#define FILES_MAX ULONG_MAX #define FILES_MAX_STR "max"
static bool no_acct;
From: Zhang Xiaoxu zhangxiaoxu5@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
---------------------
There is a soft lockup call trace as below: CPU: 0 PID: 1360 Comm: imapsvcd Kdump: loaded Tainted: G OE task: ffff8a7296e1eeb0 ti: ffff8a7296aa0000 task.ti: ffff8a7296aa0000 RIP: 0010:[<ffffffffb691ecb4>] [<ffffffffb691ecb4>] __css_tryget+0x24/0x50 RSP: 0018:ffff8a7296aa3db8 EFLAGS: 00000a87 RAX: 0000000080000000 RBX: ffff8a7296aa3df8 RCX: ffff8a72820d9a08 RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8a72820d9a00 RBP: ffff8a7296aa3db8 R08: 000000000001c360 R09: ffffffffb6a478f4 R10: ffffffffb6935e83 R11: ffffffffffffffd0 R12: 0000000057d35cd8 R13: 000000d000000002 R14: ffffffffb6892fbe R15: 000000d000000002 FS: 0000000000000000(0000) GS:ffff8a72fec00000(0063) knlGS:00000000c6e65b40 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 0000000057d35cd8 CR3: 00000007e8008000 CR4: 00000000003607f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffb6a93578>] files_cgroup_assign+0x48/0x60 [<ffffffffb6a47972>] dup_fd+0xb2/0x2f0 [<ffffffffb6935e83>] ? audit_alloc+0xe3/0x180 [<ffffffffb6893a03>] copy_process+0xbd3/0x1a40 [<ffffffffb6894a21>] do_fork+0x91/0x320 [<ffffffffb6f329e6>] ? trace_do_page_fault+0x56/0x150 [<ffffffffb6894d36>] SyS_clone+0x16/0x20 [<ffffffffb6f3bf8c>] ia32_ptregs_common+0x4c/0xfc code: 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 8d 4f 08 48 89 e5 8b 47 08 8d 90 00 00 00 80 85 c0 0f 49 d0 8d 72 01 89 d0 f0 0f b1
When the child process exits, we don't decrement the refcnt, so the refcnt may overflow. 'task_get_css' then loops forever: 'css_refcnt' returns an unbiased refcnt, and if that refcnt is negative, '__css_tryget' always returns false, so 'task_get_css' never terminates.
The child process always calls 'close_files' on exit, so decrement the refcnt there.
Signed-off-by: Zhang Xiaoxu zhangxiaoxu5@huawei.com Signed-off-by: chenridong chenridong@huawei.com --- fs/file.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/file.c b/fs/file.c index c6565d998f8c..b44712c62758 100644 --- a/fs/file.c +++ b/fs/file.c @@ -463,6 +463,9 @@ static struct fdtable *close_files(struct files_struct * files) set >>= 1; } } +#ifdef CONFIG_CGROUP_FILES + files_cgroup_remove(files); +#endif
return fdt; }
From: Yang Yingliang yangyingliang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
When the files cgroup is enabled, it leads to a syscall performance regression in UnixBench. Add a helper files_cgroup_enabled() and use it to control whether the files cgroup is used; we can pass cgroup_disable=files on the kernel command line to disable the files cgroup.
UnixBench syscall score (larger is better): files cgroup enabled: 2868.5; files cgroup disabled: 3177.0; files cgroup config disabled: 3186.5
Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: chenridong chenridong@huawei.com --- .../admin-guide/kernel-parameters.txt | 7 +++--- fs/file.c | 23 ++++++++++++------- include/linux/filescontrol.h | 6 +++++ 3 files changed, 25 insertions(+), 11 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 41644336e358..fa4775c9d6fc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -565,9 +565,10 @@ - if foo is an optional feature then the feature is disabled and corresponding cgroup files are not created - {Currently only "memory" controller deal with this and - cut the overhead, others just disable the usage. So - only cgroup_disable=memory is actually worthy} + {Currently only "memory" and and "files" controller + deal with this and cut the overhead, others just + disable the usage. So only cgroup_disable=memory and + cgroup_disable=files are actually worthy} Specifying "pressure" disables per-cgroup pressure stall information accounting feature
diff --git a/fs/file.c b/fs/file.c index b44712c62758..6c63342ef506 100644 --- a/fs/file.c +++ b/fs/file.c @@ -339,7 +339,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; #ifdef CONFIG_CGROUP_FILES - files_cgroup_assign(newf); + if (files_cgroup_enabled()) + files_cgroup_assign(newf); #endif
spin_lock(&oldf->file_lock); @@ -405,10 +406,12 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
rcu_assign_pointer(newf->fdt, new_fdt); #ifdef CONFIG_CGROUP_FILES + if (!files_cgroup_enabled()) + return newf; if (!files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf))) return newf;
-/* could not get enough FD resources. Need to clean up. */ + /* could not get enough FD resources. Need to clean up. */ new_fds = new_fdt->fd; for (i = open_files; i != 0; i--) { struct file *f = *new_fds++; @@ -425,7 +428,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
out_release: #ifdef CONFIG_CGROUP_FILES - files_cgroup_remove(newf); + if (files_cgroup_enabled()) + files_cgroup_remove(newf); #endif kmem_cache_free(files_cachep, newf); out: @@ -453,7 +457,8 @@ static struct fdtable *close_files(struct files_struct * files) struct file * file = xchg(&fdt->fd[i], NULL); if (file) { #ifdef CONFIG_CGROUP_FILES - files_cgroup_unalloc_fd(files, 1); + if (files_cgroup_enabled()) + files_cgroup_unalloc_fd(files, 1); #endif filp_close(file, files); cond_resched(); @@ -464,7 +469,8 @@ static struct fdtable *close_files(struct files_struct * files) } } #ifdef CONFIG_CGROUP_FILES - files_cgroup_remove(files); + if (files_cgroup_enabled()) + files_cgroup_remove(files); #endif
return fdt; @@ -561,7 +567,7 @@ static int alloc_fd(unsigned start, unsigned end, unsigned flags) if (error) goto repeat; #ifdef CONFIG_CGROUP_FILES - if (files_cgroup_alloc_fd(files, 1)) { + if (files_cgroup_enabled() && files_cgroup_alloc_fd(files, 1)) { error = -EMFILE; goto out; } @@ -604,7 +610,7 @@ static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); #ifdef CONFIG_CGROUP_FILES - if (test_bit(fd, fdt->open_fds)) + if (files_cgroup_enabled() && test_bit(fd, fdt->open_fds)) files_cgroup_unalloc_fd(files, 1); #endif __clear_open_fd(fd, fdt); @@ -1170,7 +1176,8 @@ __releases(&files->file_lock) goto out; } #ifdef CONFIG_CGROUP_FILES - if (!tofree && files_cgroup_alloc_fd(files, 1)) { + if (files_cgroup_enabled() && + !tofree && files_cgroup_alloc_fd(files, 1)) { err = -EMFILE; goto out; } diff --git a/include/linux/filescontrol.h b/include/linux/filescontrol.h index 49dc620cf64e..0182f145a339 100644 --- a/include/linux/filescontrol.h +++ b/include/linux/filescontrol.h @@ -19,6 +19,7 @@ #define _LINUX_FILESCONTROL_H
#include <linux/fdtable.h> +#include <linux/cgroup.h>
#ifdef CONFIG_CGROUP_FILES
@@ -30,5 +31,10 @@ extern struct files_struct init_files; void files_cgroup_assign(struct files_struct *files); void files_cgroup_remove(struct files_struct *files);
+static inline bool files_cgroup_enabled(void) +{ + return cgroup_subsys_enabled(files_cgrp_subsys); +} + #endif /* CONFIG_CGROUP_FILES */ #endif /* _LINUX_FILESCONTROL_H */
From: "zhangyi (F)" yi.zhang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4G4S5
---------------------------
files_fdtable() in files_cgroup_count_fds() should be invoked under files_struct->file_lock; otherwise the suspicious RCU usage warning below is triggered when CONFIG_PROVE_RCU and CONFIG_LOCKDEP are enabled.
============================= WARNING: suspicious RCU usage ... ----------------------------- fs/filescontrol.c:96 suspicious rcu_dereference_check() usage! ... stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.19.36-cph920-32bitc-vmalloc-binder-debugon.aarch64 #6 Call trace: dump_backtrace+0x0/0x198 show_stack+0x24/0x30 dump_stack+0xd0/0x11c lockdep_rcu_suspicious+0xcc/0x110 files_cgroup_count_fds+0xc0/0xe0 dup_fd+0x234/0x448 copy_process.isra.2.part.3+0x698/0x1490 _do_fork+0xe8/0x728 kernel_thread+0x48/0x58 rest_init+0x34/0x2a0 start_kernel+0x52c/0x558
Although 'newf' is newly created and will not be released in parallel, still silence the warning by taking the spin_lock around the call.
Fixes: 52cc1eccf6de ("cgroups: Resource controller for open files") Signed-off-by: zhangyi (F) yi.zhang@huawei.com Signed-off-by: chenridong chenridong@huawei.com --- fs/file.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/file.c b/fs/file.c index 6c63342ef506..a9558b693dba 100644 --- a/fs/file.c +++ b/fs/file.c @@ -408,8 +408,13 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int #ifdef CONFIG_CGROUP_FILES if (!files_cgroup_enabled()) return newf; - if (!files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf))) + spin_lock(&newf->file_lock); + if (!files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf))) { + spin_unlock(&newf->file_lock); return newf; + } + spin_unlock(&newf->file_lock); +
/* could not get enough FD resources. Need to clean up. */ new_fds = new_fdt->fd;
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,转换为PR失败! 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/Q... 失败原因:应用补丁/补丁集失败,Patch failed at 0001 iommu/arm-smmu-v3: disable stall for quiet_cd 建议解决方法:请查看失败原因, 确认补丁是否可以应用在当前期望分支的最新代码上
Feedback: Converting the patch(es) you sent to kernel@openeuler.org into a PR failed! Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/Q... Failed Reason: applying the patch(es) failed, Patch failed at 0001 iommu/arm-smmu-v3: disable stall for quiet_cd Suggested Solution: please check the failure reason and confirm whether the patch(es) can be applied on top of the latest code in the expected branch