Binder Makin (1): cgroups: Resource controller for open files
Hou Tao (1): cgroup/files: fix bugs as mentioned
Lu Jialin (1): enable CONFIG_CGROUP_FILES in openeuler_defconfig for x86 and arm64
.../admin-guide/kernel-parameters.txt | 7 +- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + fs/Makefile | 1 + fs/file.c | 68 +++- fs/filescontrol.c | 312 ++++++++++++++++++ include/linux/cgroup-defs.h | 8 +- include/linux/cgroup_subsys.h | 4 + include/linux/fdtable.h | 1 + include/linux/filescontrol.h | 40 +++ init/Kconfig | 10 + 11 files changed, 445 insertions(+), 8 deletions(-) create mode 100644 fs/filescontrol.c create mode 100644 include/linux/filescontrol.h
From: Binder Makin merimus@google.com
maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
Add a lockless resource controller for limiting the number of open file handles. This allows us to catch misbehaving processes and return EMFILE instead of ENOMEM for kernel memory limits.
Original link: https://lwn.net/Articles/604129/.After introduced https://gitlab.indel.ch/thirdparty/linux-indel/commit /5b1efc027c0b51ca3e76f4e00c83358f8349f543. All memory accounting and limiting has been switched over to the lockless page counters. So we convert original resource counters to lockless page counters.
Signed-off-by: Binder Makin merimus@google.com Signed-off-by: chenridong chenridong@huawei.com --- fs/Makefile | 1 + fs/file.c | 53 +++++- fs/filescontrol.c | 321 ++++++++++++++++++++++++++++++++++ include/linux/cgroup-defs.h | 8 +- include/linux/cgroup.h | 6 + include/linux/cgroup_subsys.h | 4 + include/linux/fdtable.h | 1 + include/linux/filescontrol.h | 34 ++++ init/Kconfig | 10 ++ 9 files changed, 433 insertions(+), 5 deletions(-) create mode 100644 fs/filescontrol.c create mode 100644 include/linux/filescontrol.h
diff --git a/fs/Makefile b/fs/Makefile index f9541f40be4e..bb6164200a39 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o sysctls.o
obj-$(CONFIG_FHANDLE) += fhandle.o +obj-$(CONFIG_CGROUP_FILES) += filescontrol.o obj-y += iomap/
obj-y += quota/ diff --git a/fs/file.c b/fs/file.c index 3e4a4dfa38fc..c6565d998f8c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> +#include <linux/filescontrol.h> #include <linux/close_range.h> #include <net/sock.h>
@@ -337,6 +338,9 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; +#ifdef CONFIG_CGROUP_FILES + files_cgroup_assign(newf); +#endif
spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); @@ -400,10 +404,29 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
rcu_assign_pointer(newf->fdt, new_fdt); +#ifdef CONFIG_CGROUP_FILES + if (!files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf))) + return newf; + +/* could not get enough FD resources. Need to clean up. */ + new_fds = new_fdt->fd; + for (i = open_files; i != 0; i--) { + struct file *f = *new_fds++;
+ if (f) + fput(f); + } + if (new_fdt != &newf->fdtab) + __free_fdtable(new_fdt); + *errorp = -EMFILE; +#else return newf; +#endif
out_release: +#ifdef CONFIG_CGROUP_FILES + files_cgroup_remove(newf); +#endif kmem_cache_free(files_cachep, newf); out: return NULL; @@ -429,6 +452,9 @@ static struct fdtable *close_files(struct files_struct * files) if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); if (file) { +#ifdef CONFIG_CGROUP_FILES + files_cgroup_unalloc_fd(files, 1); +#endif filp_close(file, files); cond_resched(); } @@ -531,6 +557,12 @@ static int alloc_fd(unsigned start, unsigned end, unsigned flags) */ if (error) goto repeat; +#ifdef CONFIG_CGROUP_FILES + if (files_cgroup_alloc_fd(files, 1)) { + error = -EMFILE; + goto out; + } +#endif
if (start <= files->next_fd) files->next_fd = fd + 1; @@ -568,6 +600,10 @@ EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); +#ifdef CONFIG_CGROUP_FILES + if (test_bit(fd, fdt->open_fds)) + files_cgroup_unalloc_fd(files, 1); +#endif __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; @@ -1106,6 +1142,7 @@ static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { + int err; struct file *tofree; struct fdtable *fdt;
@@ -1125,8 +1162,16 @@ __releases(&files->file_lock) */ fdt = files_fdtable(files); tofree = fdt->fd[fd]; - if (!tofree && fd_is_open(fd, fdt)) - goto Ebusy; + if (!tofree && fd_is_open(fd, fdt)) { + err = -EBUSY; + goto out; + } +#ifdef CONFIG_CGROUP_FILES + if (!tofree && files_cgroup_alloc_fd(files, 1)) { + err = -EMFILE; + goto out; + } +#endif get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt); @@ -1141,9 +1186,9 @@ __releases(&files->file_lock)
return fd;
-Ebusy: +out: spin_unlock(&files->file_lock); - return -EBUSY; + return err; }
int replace_fd(unsigned fd, struct file *file, unsigned flags) diff --git a/fs/filescontrol.c b/fs/filescontrol.c new file mode 100644 index 000000000000..44ad9ef44e20 --- /dev/null +++ b/fs/filescontrol.c @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: GPL-2.0 +/* filescontrol.c - Cgroup controller for open file handles. + * + * Copyright 2014 Google Inc. + * Author: Brian Makin merimus@google.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/page_counter.h> +#include <linux/filescontrol.h> +#include <linux/cgroup.h> +#include <linux/export.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/fdtable.h> +#include <linux/sched/signal.h> + +#define FILES_MAX ULLONG_MAX +#define FILES_MAX_STR "max" + + +struct cgroup_subsys files_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(files_cgrp_subsys); + +struct files_cgroup { + struct cgroup_subsys_state css; + struct page_counter open_handles; +}; + +static inline struct files_cgroup *css_fcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct files_cgroup, css) : NULL; +} + +static inline struct page_counter * +css_res_open_handles(struct cgroup_subsys_state *css) +{ + return &css_fcg(css)->open_handles; +} + +static inline struct files_cgroup * +files_cgroup_from_files(struct files_struct *files) +{ + return files->files_cgroup; +} + + +static struct cgroup_subsys_state * +files_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct files_cgroup *parent_fcg; + struct files_cgroup *fcg; + + parent_fcg = css_fcg(parent_css); + fcg = kzalloc(sizeof(*fcg), GFP_KERNEL); + if (!fcg) + goto out; + + if (!parent_fcg) { + page_counter_init(&fcg->open_handles, NULL); + page_counter_set_max(&fcg->open_handles, FILES_MAX); + } else { + struct page_counter *p_counter = &parent_fcg->open_handles; + + page_counter_init(&fcg->open_handles, p_counter); + page_counter_set_max(&fcg->open_handles, FILES_MAX); + } + return &fcg->css; + +out: + return ERR_PTR(-ENOMEM); +} + +static void files_cgroup_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_fcg(css)); +} + +u64 files_cgroup_count_fds(struct files_struct *files) +{ + int i; + struct fdtable *fdt; + int retval = 0; + + fdt = files_fdtable(files); + for (i = 0; i < DIV_ROUND_UP(fdt->max_fds, BITS_PER_LONG); i++) + retval += hweight64((__u64)fdt->open_fds[i]); + return retval; +} + +static u64 files_in_taskset(struct cgroup_taskset *tset) +{ + struct task_struct *task; + u64 files = 0; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(task, css, tset) { + if (!thread_group_leader(task)) + continue; + + task_lock(task); + files += files_cgroup_count_fds(task->files); + task_unlock(task); + } + return files; +} + +/* + * If attaching this cgroup would overcommit the resource then deny + * the attach. + */ +static int files_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + unsigned long margin; + struct page_counter *cnt; + unsigned long counter; + u64 files = files_in_taskset(tset); + + cgroup_taskset_first(tset, &css); + cnt = css_res_open_handles(css); + + counter = (unsigned long)atomic_long_read(&cnt->usage); + if (cnt->max > counter) + margin = cnt->max - counter; + else + margin = 0; + if (margin < files) + return -ENOMEM; + return 0; +} + +/* + * If resource counts have gone up between can_attach and attach then + * this may overcommit resources. In that case just deny further allocation + * until the resource usage drops. + */ +static void files_cgroup_attach(struct cgroup_taskset *tset) +{ + u64 num_files; + struct cgroup_subsys_state *to_css; + struct cgroup_subsys_state *from_css; + struct page_counter *from_res; + struct page_counter *to_res; + struct page_counter *fail_res; + struct files_struct *files; + struct task_struct *task = cgroup_taskset_first(tset, &to_css); + + to_res = css_res_open_handles(to_css); + + task_lock(task); + files = task->files; + if (!files) { + task_unlock(task); + return; + } + + from_css = &files_cgroup_from_files(files)->css; + from_res = css_res_open_handles(from_css); + + spin_lock(&files->file_lock); + num_files = files_cgroup_count_fds(files); + page_counter_uncharge(from_res, num_files); + css_put(from_css); + + if (!page_counter_try_charge(to_res, num_files, &fail_res)) + pr_err("Open files limit overcommited\n"); + css_get(to_css); + task->files->files_cgroup = css_fcg(to_css); + spin_unlock(&files->file_lock); + task_unlock(task); +} + +int files_cgroup_alloc_fd(struct files_struct *files, u64 n) +{ + /* + * Kernel threads which are forked by kthreadd inherited the + * const files_struct 'init_files', we didn't wrap it so + * there's no associated files_cgroup. + * + * Kernel threads always stay in root cgroup, and we don't + * have limit for root files cgroup, so it won't hurt if + * we don't charge their fds, only issue is that files.usage + * won't be accurate in root files cgroup. + */ + if (files != &init_files) { + struct page_counter *fail_res; + struct files_cgroup *files_cgroup = + files_cgroup_from_files(files); + if (!page_counter_try_charge(&files_cgroup->open_handles, + n, &fail_res)) + return -ENOMEM; + } + return 0; +} +EXPORT_SYMBOL(files_cgroup_alloc_fd); + +void files_cgroup_unalloc_fd(struct files_struct *files, u64 n) +{ + /* + * It's not charged so no need to uncharge, see comments in + * files_cgroup_alloc_fd. + */ + if (files != &init_files) { + struct files_cgroup *files_cgroup = + files_cgroup_from_files(files); + page_counter_uncharge(&files_cgroup->open_handles, n); + } +} +EXPORT_SYMBOL(files_cgroup_unalloc_fd); + + +static int files_limit_read(struct seq_file *sf, void *v) +{ + struct files_cgroup *fcg = css_fcg(seq_css(sf)); + struct page_counter *counter = &fcg->open_handles; + u64 limit = counter->max; + + if (limit >= FILES_MAX) + seq_printf(sf, "%s\n", FILES_MAX_STR); + else + seq_printf(sf, "%llu\n", limit); + + return 0; +} + +static ssize_t files_limit_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct files_cgroup *fcg = css_fcg(of_css(of)); + u64 limit; + int err; + + buf = strstrip((char *)buf); + if (!strcmp(buf, FILES_MAX_STR)) { + limit = FILES_MAX; + goto set_limit; + } + + err = kstrtoull(buf, 0, &limit); + if (err) + return err; + +set_limit: + /* + * Limit updates don't need to be mutex'd, since it isn't + * critical that any racing fork()s follow the new limit. + */ + page_counter_set_max(&fcg->open_handles, limit); + return nbytes; +} + + +static u64 files_usage_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct files_cgroup *fcg = css_fcg(css); + + return page_counter_read(&fcg->open_handles); +} + +static struct cftype files[] = { + { + .name = "limit", + .seq_show = files_limit_read, + .write = files_limit_write, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "usage", + .read_u64 = files_usage_read, + }, + { } +}; + +struct cgroup_subsys files_cgrp_subsys = { + .css_alloc = files_cgroup_css_alloc, + .css_free = files_cgroup_css_free, + .can_attach = files_cgroup_can_attach, + .attach = files_cgroup_attach, + .legacy_cftypes = files, + .dfl_cftypes = files, +}; + +void files_cgroup_assign(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + + task_lock(tsk); + cgrp = task_cgroup(tsk, files_cgrp_id); + css = cgroup_subsys_state(cgrp, files_cgrp_id); + css_get(css); + files->files_cgroup = container_of(css, struct files_cgroup, css); + task_unlock(tsk); +} + +void files_cgroup_remove(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct files_cgroup *fcg; + + task_lock(tsk); + spin_lock(&files->file_lock); + fcg = files_cgroup_from_files(files); + css_put(&fcg->css); + spin_unlock(&files->file_lock); + task_unlock(tsk); +} diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 265da00a1a8b..ad632b340c7b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -653,6 +653,12 @@ struct cftype { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off);
+ int (*read_seq_string)(struct cgroup *cont, struct cftype *cft, + struct seq_file *m); + + int (*write_string)(struct cgroup *cgrp, struct cftype *cft, + const char *buffer); + __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt);
@@ -742,7 +748,7 @@ struct cgroup_subsys { */ struct cftype *dfl_cftypes; /* for the default hierarchy */ struct cftype *legacy_cftypes; /* for the legacy hierarchies */ - + struct cftype *base_cftypes; /* * A subsystem may depend on other subsystems. When such subsystem * is enabled on a cgroup, the depended-upon subsystems are enabled diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7fa51b600ee8..1cb90ea3299a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -362,6 +362,12 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); }
+static inline struct cgroup_subsys_state *cgroup_subsys_state( + struct cgroup *cgrp, int subsys_id) +{ + return cgrp->subsys[subsys_id]; +} + extern struct mutex cgroup_mutex;
static inline void cgroup_lock(void) diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 445235487230..85fa78049bd0 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -72,6 +72,10 @@ SUBSYS(misc) SUBSYS(debug) #endif
+#if IS_ENABLED(CONFIG_CGROUP_FILES) +SUBSYS(files) +#endif + /* * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. */ diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index e066816f3519..22b8b03fef6d 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -65,6 +65,7 @@ struct files_struct { unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; + struct files_cgroup *files_cgroup; };
struct file_operations; diff --git a/include/linux/filescontrol.h b/include/linux/filescontrol.h new file mode 100644 index 000000000000..49dc620cf64e --- /dev/null +++ b/include/linux/filescontrol.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* filescontrol.h - Files Controller + * + * Copyright 2014 Google Inc. + * Author: Brian Makin merimus@google.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_FILESCONTROL_H +#define _LINUX_FILESCONTROL_H + +#include <linux/fdtable.h> + +#ifdef CONFIG_CGROUP_FILES + +extern int files_cgroup_alloc_fd(struct files_struct *files, u64 n); +extern void files_cgroup_unalloc_fd(struct files_struct *files, u64 n); +extern u64 files_cgroup_count_fds(struct files_struct *files); +extern struct files_struct init_files; + +void files_cgroup_assign(struct files_struct *files); +void files_cgroup_remove(struct files_struct *files); + +#endif /* CONFIG_CGROUP_FILES */ +#endif /* _LINUX_FILESCONTROL_H */ diff --git a/init/Kconfig b/init/Kconfig index 2ee1384c4f81..418f94fb43b1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1197,6 +1197,16 @@ config CGROUP_V1_KILL default n depends on CGROUPS
+config CGROUP_FILES + bool "Files Resource Controller for Control Groups" + select PAGE_COUNTER + default n + help + Provides a cgroup resource controller that limits number of open + file handles within a cgroup. + This supports catching misbehaving processes and + return EMFILE instead of ENOMEM for kernel memory limits. + endif # CGROUPS
menuconfig NAMESPACES
From: Hou Tao houtao1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
this is bugs fix as mentioned, you can see issues for details
Merge OLK-5.10 fix: 26ba3a842868 Merge OLK-5.10 fix: 7e485fecbd7f Merge OLK-5.10 fix: a466f5dda467 Merge OLK-5.10 fix: 338fc539bb48 Merge OLK-5.10 fix: 1951d5627095 Merge OLK-5.10 fix: 69cbcd5b9a67 Merge OLK-5.10 fix: 0b7329771d3d Merge OLK-5.10 fix: e2b24a5adac6
Signed-off-by: chenridong chenridong@huawei.com --- .../admin-guide/kernel-parameters.txt | 7 +- fs/file.c | 31 +++-- fs/filescontrol.c | 115 ++++++++---------- include/linux/cgroup.h | 6 - include/linux/filescontrol.h | 6 + 5 files changed, 86 insertions(+), 79 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 41644336e358..fa4775c9d6fc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -565,9 +565,10 @@ - if foo is an optional feature then the feature is disabled and corresponding cgroup files are not created - {Currently only "memory" controller deal with this and - cut the overhead, others just disable the usage. So - only cgroup_disable=memory is actually worthy} + {Currently only "memory" and and "files" controller + deal with this and cut the overhead, others just + disable the usage. So only cgroup_disable=memory and + cgroup_disable=files are actually worthy} Specifying "pressure" disables per-cgroup pressure stall information accounting feature
diff --git a/fs/file.c b/fs/file.c index c6565d998f8c..a9558b693dba 100644 --- a/fs/file.c +++ b/fs/file.c @@ -339,7 +339,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; #ifdef CONFIG_CGROUP_FILES - files_cgroup_assign(newf); + if (files_cgroup_enabled()) + files_cgroup_assign(newf); #endif
spin_lock(&oldf->file_lock); @@ -405,10 +406,17 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
rcu_assign_pointer(newf->fdt, new_fdt); #ifdef CONFIG_CGROUP_FILES - if (!files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf))) + if (!files_cgroup_enabled()) return newf; + spin_lock(&newf->file_lock); + if (!files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf))) { + spin_unlock(&newf->file_lock); + return newf; + } + spin_unlock(&newf->file_lock); +
-/* could not get enough FD resources. Need to clean up. */ + /* could not get enough FD resources. Need to clean up. */ new_fds = new_fdt->fd; for (i = open_files; i != 0; i--) { struct file *f = *new_fds++; @@ -425,7 +433,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
out_release: #ifdef CONFIG_CGROUP_FILES - files_cgroup_remove(newf); + if (files_cgroup_enabled()) + files_cgroup_remove(newf); #endif kmem_cache_free(files_cachep, newf); out: @@ -453,7 +462,8 @@ static struct fdtable *close_files(struct files_struct * files) struct file * file = xchg(&fdt->fd[i], NULL); if (file) { #ifdef CONFIG_CGROUP_FILES - files_cgroup_unalloc_fd(files, 1); + if (files_cgroup_enabled()) + files_cgroup_unalloc_fd(files, 1); #endif filp_close(file, files); cond_resched(); @@ -463,6 +473,10 @@ static struct fdtable *close_files(struct files_struct * files) set >>= 1; } } +#ifdef CONFIG_CGROUP_FILES + if (files_cgroup_enabled()) + files_cgroup_remove(files); +#endif
return fdt; } @@ -558,7 +572,7 @@ static int alloc_fd(unsigned start, unsigned end, unsigned flags) if (error) goto repeat; #ifdef CONFIG_CGROUP_FILES - if (files_cgroup_alloc_fd(files, 1)) { + if (files_cgroup_enabled() && files_cgroup_alloc_fd(files, 1)) { error = -EMFILE; goto out; } @@ -601,7 +615,7 @@ static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); #ifdef CONFIG_CGROUP_FILES - if (test_bit(fd, fdt->open_fds)) + if (files_cgroup_enabled() && test_bit(fd, fdt->open_fds)) files_cgroup_unalloc_fd(files, 1); #endif __clear_open_fd(fd, fdt); @@ -1167,7 +1181,8 @@ __releases(&files->file_lock) goto out; } #ifdef CONFIG_CGROUP_FILES - if (!tofree && files_cgroup_alloc_fd(files, 1)) { + if (files_cgroup_enabled() && + !tofree && files_cgroup_alloc_fd(files, 1)) { err = -EMFILE; goto out; } diff --git a/fs/filescontrol.c b/fs/filescontrol.c index 44ad9ef44e20..4ad500f40025 100644 --- a/fs/filescontrol.c +++ b/fs/filescontrol.c @@ -25,14 +25,17 @@ #include <linux/seq_file.h> #include <linux/fdtable.h> #include <linux/sched/signal.h> +#include <linux/module.h>
-#define FILES_MAX ULLONG_MAX +#define FILES_MAX ULONG_MAX #define FILES_MAX_STR "max"
- +static bool no_acct; struct cgroup_subsys files_cgrp_subsys __read_mostly; EXPORT_SYMBOL(files_cgrp_subsys);
+module_param(no_acct, bool, 0444); + struct files_cgroup { struct cgroup_subsys_state css; struct page_counter open_handles; @@ -99,56 +102,14 @@ u64 files_cgroup_count_fds(struct files_struct *files) return retval; }
-static u64 files_in_taskset(struct cgroup_taskset *tset) -{ - struct task_struct *task; - u64 files = 0; - struct cgroup_subsys_state *css; - - cgroup_taskset_for_each(task, css, tset) { - if (!thread_group_leader(task)) - continue; - - task_lock(task); - files += files_cgroup_count_fds(task->files); - task_unlock(task); - } - return files; -} - /* * If attaching this cgroup would overcommit the resource then deny - * the attach. + * the attach. If not, attach the file resource into new cgroup. */ static int files_cgroup_can_attach(struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - unsigned long margin; - struct page_counter *cnt; - unsigned long counter; - u64 files = files_in_taskset(tset); - - cgroup_taskset_first(tset, &css); - cnt = css_res_open_handles(css); - - counter = (unsigned long)atomic_long_read(&cnt->usage); - if (cnt->max > counter) - margin = cnt->max - counter; - else - margin = 0; - if (margin < files) - return -ENOMEM; - return 0; -} - -/* - * If resource counts have gone up between can_attach and attach then - * this may overcommit resources. In that case just deny further allocation - * until the resource usage drops. - */ -static void files_cgroup_attach(struct cgroup_taskset *tset) { u64 num_files; + bool can_attach; struct cgroup_subsys_state *to_css; struct cgroup_subsys_state *from_css; struct page_counter *from_res; @@ -161,9 +122,9 @@ static void files_cgroup_attach(struct cgroup_taskset *tset)
task_lock(task); files = task->files; - if (!files) { + if (!files || files == &init_files) { task_unlock(task); - return; + return 0; }
from_css = &files_cgroup_from_files(files)->css; @@ -172,14 +133,20 @@ static void files_cgroup_attach(struct cgroup_taskset *tset) spin_lock(&files->file_lock); num_files = files_cgroup_count_fds(files); page_counter_uncharge(from_res, num_files); - css_put(from_css);
- if (!page_counter_try_charge(to_res, num_files, &fail_res)) + if (!page_counter_try_charge(to_res, num_files, &fail_res)) { + page_counter_charge(from_res, num_files); pr_err("Open files limit overcommited\n"); - css_get(to_css); - task->files->files_cgroup = css_fcg(to_css); + can_attach = false; + } else { + css_put(from_css); + css_get(to_css); + task->files->files_cgroup = css_fcg(to_css); + can_attach = true; + } spin_unlock(&files->file_lock); task_unlock(task); + return can_attach ? 0 : -ENOSPC; }
int files_cgroup_alloc_fd(struct files_struct *files, u64 n) @@ -194,7 +161,7 @@ int files_cgroup_alloc_fd(struct files_struct *files, u64 n) * we don't charge their fds, only issue is that files.usage * won't be accurate in root files cgroup. */ - if (files != &init_files) { + if (!no_acct && files != &init_files) { struct page_counter *fail_res; struct files_cgroup *files_cgroup = files_cgroup_from_files(files); @@ -212,7 +179,7 @@ void files_cgroup_unalloc_fd(struct files_struct *files, u64 n) * It's not charged so no need to uncharge, see comments in * files_cgroup_alloc_fd. */ - if (files != &init_files) { + if (!no_acct && files != &init_files) { struct files_cgroup *files_cgroup = files_cgroup_from_files(files); page_counter_uncharge(&files_cgroup->open_handles, n); @@ -220,6 +187,21 @@ void files_cgroup_unalloc_fd(struct files_struct *files, u64 n) } EXPORT_SYMBOL(files_cgroup_unalloc_fd);
+static u64 files_disabled_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return no_acct; +} + +static int files_disabled_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (!val) + return -EINVAL; + no_acct = true; + + return 0; +}
static int files_limit_read(struct seq_file *sf, void *v) { @@ -281,6 +263,12 @@ static struct cftype files[] = { .name = "usage", .read_u64 = files_usage_read, }, + { + .name = "no_acct", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = files_disabled_read, + .write_u64 = files_disabled_write, + }, { } };
@@ -288,23 +276,23 @@ struct cgroup_subsys files_cgrp_subsys = { .css_alloc = files_cgroup_css_alloc, .css_free = files_cgroup_css_free, .can_attach = files_cgroup_can_attach, - .attach = files_cgroup_attach, .legacy_cftypes = files, .dfl_cftypes = files, };
+/* + * It could race against cgroup migration of current task, and + * using task_get_css() to get a valid css. + */ void files_cgroup_assign(struct files_struct *files) { - struct task_struct *tsk = current; struct cgroup_subsys_state *css; - struct cgroup *cgrp;
- task_lock(tsk); - cgrp = task_cgroup(tsk, files_cgrp_id); - css = cgroup_subsys_state(cgrp, files_cgrp_id); - css_get(css); + if (files == &init_files) + return; + + css = task_get_css(current, files_cgrp_id); files->files_cgroup = container_of(css, struct files_cgroup, css); - task_unlock(tsk); }
void files_cgroup_remove(struct files_struct *files) @@ -312,6 +300,9 @@ void files_cgroup_remove(struct files_struct *files) struct task_struct *tsk = current; struct files_cgroup *fcg;
+ if (files == &init_files) + return; + task_lock(tsk); spin_lock(&files->file_lock); fcg = files_cgroup_from_files(files); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1cb90ea3299a..7fa51b600ee8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -362,12 +362,6 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); }
-static inline struct cgroup_subsys_state *cgroup_subsys_state( - struct cgroup *cgrp, int subsys_id) -{ - return cgrp->subsys[subsys_id]; -} - extern struct mutex cgroup_mutex;
static inline void cgroup_lock(void) diff --git a/include/linux/filescontrol.h b/include/linux/filescontrol.h index 49dc620cf64e..0182f145a339 100644 --- a/include/linux/filescontrol.h +++ b/include/linux/filescontrol.h @@ -19,6 +19,7 @@ #define _LINUX_FILESCONTROL_H
#include <linux/fdtable.h> +#include <linux/cgroup.h>
#ifdef CONFIG_CGROUP_FILES
@@ -30,5 +31,10 @@ extern struct files_struct init_files; void files_cgroup_assign(struct files_struct *files); void files_cgroup_remove(struct files_struct *files);
+static inline bool files_cgroup_enabled(void) +{ + return cgroup_subsys_enabled(files_cgrp_subsys); +} + #endif /* CONFIG_CGROUP_FILES */ #endif /* _LINUX_FILESCONTROL_H */
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
set CONFIG_CGROUP_FILES=y in openeuler_defconfig for x86 and arm64.
Signed-off-by: Lu Jialin lujialin4@huawei.com Signed-off-by: chenridong chenridong@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 33ba39711884..030359bc1b07 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -181,6 +181,7 @@ CONFIG_CGROUP_BPF=y # CONFIG_CGROUP_DEBUG is not set CONFIG_SOCK_CGROUP_DATA=y CONFIG_CGROUP_V1_KILL=y +CONFIG_CGROUP_FILES=y CONFIG_NAMESPACES=y CONFIG_UTS_NS=y CONFIG_TIME_NS=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 44040b835333..515e44d71457 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -203,6 +203,7 @@ CONFIG_CGROUP_BPF=y # CONFIG_CGROUP_DEBUG is not set CONFIG_SOCK_CGROUP_DATA=y CONFIG_CGROUP_V1_KILL=y +CONFIG_CGROUP_FILES=y CONFIG_NAMESPACES=y CONFIG_UTS_NS=y CONFIG_TIME_NS=y
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/3524 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/E...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/3524 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/E...