From: Binder Makin merimus@google.com
maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I
--------------------------------
Add a lockless resource controller for limiting the number of open file handles. This allows us to catch misbehaving processes and return EMFILE instead of ENOMEM for kernel memory limits.
Original link: https://lwn.net/Articles/604129/.After introduced https://gitlab.indel.ch/thirdparty/linux-indel/commit /5b1efc027c0b51ca3e76f4e00c83358f8349f543. All memory accounting and limiting has been switched over to the lockless page counters. So we convert original resource counters to lockless page counters.
Signed-off-by: Binder Makin merimus@google.com Signed-off-by: chenridong chenridong@huawei.com --- fs/Makefile | 1 + fs/file.c | 56 +++++- fs/filescontrol.c | 312 ++++++++++++++++++++++++++++++++++ include/linux/cgroup-defs.h | 8 +- include/linux/cgroup_subsys.h | 4 + include/linux/fdtable.h | 1 + include/linux/filescontrol.h | 34 ++++ init/Kconfig | 10 ++ 8 files changed, 421 insertions(+), 5 deletions(-) create mode 100644 fs/filescontrol.c create mode 100644 include/linux/filescontrol.h
diff --git a/fs/Makefile b/fs/Makefile index f9541f40be4e..bb6164200a39 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o sysctls.o
obj-$(CONFIG_FHANDLE) += fhandle.o +obj-$(CONFIG_CGROUP_FILES) += filescontrol.o obj-y += iomap/
obj-y += quota/ diff --git a/fs/file.c b/fs/file.c index 3e4a4dfa38fc..b44712c62758 100644 --- a/fs/file.c +++ b/fs/file.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> +#include <linux/filescontrol.h> #include <linux/close_range.h> #include <net/sock.h>
@@ -337,6 +338,9 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; +#ifdef CONFIG_CGROUP_FILES + files_cgroup_assign(newf); +#endif
spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); @@ -400,10 +404,29 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
rcu_assign_pointer(newf->fdt, new_fdt); +#ifdef CONFIG_CGROUP_FILES + if (!files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf))) + return newf; + +/* could not get enough FD resources. Need to clean up. */ + new_fds = new_fdt->fd; + for (i = open_files; i != 0; i--) { + struct file *f = *new_fds++;
+ if (f) + fput(f); + } + if (new_fdt != &newf->fdtab) + __free_fdtable(new_fdt); + *errorp = -EMFILE; +#else return newf; +#endif
out_release: +#ifdef CONFIG_CGROUP_FILES + files_cgroup_remove(newf); +#endif kmem_cache_free(files_cachep, newf); out: return NULL; @@ -429,6 +452,9 @@ static struct fdtable *close_files(struct files_struct * files) if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); if (file) { +#ifdef CONFIG_CGROUP_FILES + files_cgroup_unalloc_fd(files, 1); +#endif filp_close(file, files); cond_resched(); } @@ -437,6 +463,9 @@ static struct fdtable *close_files(struct files_struct * files) set >>= 1; } } +#ifdef CONFIG_CGROUP_FILES + files_cgroup_remove(files); +#endif
return fdt; } @@ -531,6 +560,12 @@ static int alloc_fd(unsigned start, unsigned end, unsigned flags) */ if (error) goto repeat; +#ifdef CONFIG_CGROUP_FILES + if (files_cgroup_alloc_fd(files, 1)) { + error = -EMFILE; + goto out; + } +#endif
if (start <= files->next_fd) files->next_fd = fd + 1; @@ -568,6 +603,10 @@ EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); +#ifdef CONFIG_CGROUP_FILES + if (test_bit(fd, fdt->open_fds)) + files_cgroup_unalloc_fd(files, 1); +#endif __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; @@ -1106,6 +1145,7 @@ static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { + int err; struct file *tofree; struct fdtable *fdt;
@@ -1125,8 +1165,16 @@ __releases(&files->file_lock) */ fdt = files_fdtable(files); tofree = fdt->fd[fd]; - if (!tofree && fd_is_open(fd, fdt)) - goto Ebusy; + if (!tofree && fd_is_open(fd, fdt)) { + err = -EBUSY; + goto out; + } +#ifdef CONFIG_CGROUP_FILES + if (!tofree && files_cgroup_alloc_fd(files, 1)) { + err = -EMFILE; + goto out; + } +#endif get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt); @@ -1141,9 +1189,9 @@ __releases(&files->file_lock)
return fd;
-Ebusy: +out: spin_unlock(&files->file_lock); - return -EBUSY; + return err; }
int replace_fd(unsigned fd, struct file *file, unsigned flags) diff --git a/fs/filescontrol.c b/fs/filescontrol.c new file mode 100644 index 000000000000..4ad500f40025 --- /dev/null +++ b/fs/filescontrol.c @@ -0,0 +1,312 @@ +// SPDX-License-Identifier: GPL-2.0 +/* filescontrol.c - Cgroup controller for open file handles. + * + * Copyright 2014 Google Inc. + * Author: Brian Makin merimus@google.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/page_counter.h> +#include <linux/filescontrol.h> +#include <linux/cgroup.h> +#include <linux/export.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/fdtable.h> +#include <linux/sched/signal.h> +#include <linux/module.h> + +#define FILES_MAX ULONG_MAX +#define FILES_MAX_STR "max" + +static bool no_acct; +struct cgroup_subsys files_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(files_cgrp_subsys); + +module_param(no_acct, bool, 0444); + +struct files_cgroup { + struct cgroup_subsys_state css; + struct page_counter open_handles; +}; + +static inline struct files_cgroup *css_fcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct files_cgroup, css) : NULL; +} + +static inline struct page_counter * +css_res_open_handles(struct cgroup_subsys_state *css) +{ + return &css_fcg(css)->open_handles; +} + +static inline struct files_cgroup * +files_cgroup_from_files(struct files_struct *files) +{ + return files->files_cgroup; +} + + +static struct cgroup_subsys_state * +files_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct files_cgroup *parent_fcg; + struct files_cgroup *fcg; + + parent_fcg = css_fcg(parent_css); + fcg = kzalloc(sizeof(*fcg), GFP_KERNEL); + if (!fcg) + goto out; + + if (!parent_fcg) { + page_counter_init(&fcg->open_handles, NULL); + page_counter_set_max(&fcg->open_handles, FILES_MAX); + } else { + struct page_counter *p_counter = &parent_fcg->open_handles; + + page_counter_init(&fcg->open_handles, p_counter); + page_counter_set_max(&fcg->open_handles, FILES_MAX); + } + return &fcg->css; + +out: + return ERR_PTR(-ENOMEM); +} + +static void files_cgroup_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_fcg(css)); +} + +u64 files_cgroup_count_fds(struct files_struct *files) +{ + int i; + struct fdtable *fdt; + int retval = 0; + + fdt = files_fdtable(files); + for (i = 0; i < DIV_ROUND_UP(fdt->max_fds, BITS_PER_LONG); i++) + retval += hweight64((__u64)fdt->open_fds[i]); + return retval; +} + +/* + * If attaching this cgroup would overcommit the resource then deny + * the attach. If not, attach the file resource into new cgroup. + */ +static int files_cgroup_can_attach(struct cgroup_taskset *tset) +{ + u64 num_files; + bool can_attach; + struct cgroup_subsys_state *to_css; + struct cgroup_subsys_state *from_css; + struct page_counter *from_res; + struct page_counter *to_res; + struct page_counter *fail_res; + struct files_struct *files; + struct task_struct *task = cgroup_taskset_first(tset, &to_css); + + to_res = css_res_open_handles(to_css); + + task_lock(task); + files = task->files; + if (!files || files == &init_files) { + task_unlock(task); + return 0; + } + + from_css = &files_cgroup_from_files(files)->css; + from_res = css_res_open_handles(from_css); + + spin_lock(&files->file_lock); + num_files = files_cgroup_count_fds(files); + page_counter_uncharge(from_res, num_files); + + if (!page_counter_try_charge(to_res, num_files, &fail_res)) { + page_counter_charge(from_res, num_files); + pr_err("Open files limit overcommited\n"); + can_attach = false; + } else { + css_put(from_css); + css_get(to_css); + task->files->files_cgroup = css_fcg(to_css); + can_attach = true; + } + spin_unlock(&files->file_lock); + task_unlock(task); + return can_attach ? 0 : -ENOSPC; +} + +int files_cgroup_alloc_fd(struct files_struct *files, u64 n) +{ + /* + * Kernel threads which are forked by kthreadd inherited the + * const files_struct 'init_files', we didn't wrap it so + * there's no associated files_cgroup. + * + * Kernel threads always stay in root cgroup, and we don't + * have limit for root files cgroup, so it won't hurt if + * we don't charge their fds, only issue is that files.usage + * won't be accurate in root files cgroup. + */ + if (!no_acct && files != &init_files) { + struct page_counter *fail_res; + struct files_cgroup *files_cgroup = + files_cgroup_from_files(files); + if (!page_counter_try_charge(&files_cgroup->open_handles, + n, &fail_res)) + return -ENOMEM; + } + return 0; +} +EXPORT_SYMBOL(files_cgroup_alloc_fd); + +void files_cgroup_unalloc_fd(struct files_struct *files, u64 n) +{ + /* + * It's not charged so no need to uncharge, see comments in + * files_cgroup_alloc_fd. + */ + if (!no_acct && files != &init_files) { + struct files_cgroup *files_cgroup = + files_cgroup_from_files(files); + page_counter_uncharge(&files_cgroup->open_handles, n); + } +} +EXPORT_SYMBOL(files_cgroup_unalloc_fd); + +static u64 files_disabled_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return no_acct; +} + +static int files_disabled_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (!val) + return -EINVAL; + no_acct = true; + + return 0; +} + +static int files_limit_read(struct seq_file *sf, void *v) +{ + struct files_cgroup *fcg = css_fcg(seq_css(sf)); + struct page_counter *counter = &fcg->open_handles; + u64 limit = counter->max; + + if (limit >= FILES_MAX) + seq_printf(sf, "%s\n", FILES_MAX_STR); + else + seq_printf(sf, "%llu\n", limit); + + return 0; +} + +static ssize_t files_limit_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct files_cgroup *fcg = css_fcg(of_css(of)); + u64 limit; + int err; + + buf = strstrip((char *)buf); + if (!strcmp(buf, FILES_MAX_STR)) { + limit = FILES_MAX; + goto set_limit; + } + + err = kstrtoull(buf, 0, &limit); + if (err) + return err; + +set_limit: + /* + * Limit updates don't need to be mutex'd, since it isn't + * critical that any racing fork()s follow the new limit. + */ + page_counter_set_max(&fcg->open_handles, limit); + return nbytes; +} + + +static u64 files_usage_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct files_cgroup *fcg = css_fcg(css); + + return page_counter_read(&fcg->open_handles); +} + +static struct cftype files[] = { + { + .name = "limit", + .seq_show = files_limit_read, + .write = files_limit_write, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "usage", + .read_u64 = files_usage_read, + }, + { + .name = "no_acct", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = files_disabled_read, + .write_u64 = files_disabled_write, + }, + { } +}; + +struct cgroup_subsys files_cgrp_subsys = { + .css_alloc = files_cgroup_css_alloc, + .css_free = files_cgroup_css_free, + .can_attach = files_cgroup_can_attach, + .legacy_cftypes = files, + .dfl_cftypes = files, +}; + +/* + * It could race against cgroup migration of current task, and + * using task_get_css() to get a valid css. + */ +void files_cgroup_assign(struct files_struct *files) +{ + struct cgroup_subsys_state *css; + + if (files == &init_files) + return; + + css = task_get_css(current, files_cgrp_id); + files->files_cgroup = container_of(css, struct files_cgroup, css); +} + +void files_cgroup_remove(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct files_cgroup *fcg; + + if (files == &init_files) + return; + + task_lock(tsk); + spin_lock(&files->file_lock); + fcg = files_cgroup_from_files(files); + css_put(&fcg->css); + spin_unlock(&files->file_lock); + task_unlock(tsk); +} diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 265da00a1a8b..ad632b340c7b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -653,6 +653,12 @@ struct cftype { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off);
+ int (*read_seq_string)(struct cgroup *cont, struct cftype *cft, + struct seq_file *m); + + int (*write_string)(struct cgroup *cgrp, struct cftype *cft, + const char *buffer); + __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt);
@@ -742,7 +748,7 @@ struct cgroup_subsys { */ struct cftype *dfl_cftypes; /* for the default hierarchy */ struct cftype *legacy_cftypes; /* for the legacy hierarchies */ - + struct cftype *base_cftypes; /* * A subsystem may depend on other subsystems. When such subsystem * is enabled on a cgroup, the depended-upon subsystems are enabled diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 445235487230..85fa78049bd0 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -72,6 +72,10 @@ SUBSYS(misc) SUBSYS(debug) #endif
+#if IS_ENABLED(CONFIG_CGROUP_FILES) +SUBSYS(files) +#endif + /* * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. */ diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index e066816f3519..22b8b03fef6d 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -65,6 +65,7 @@ struct files_struct { unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; + struct files_cgroup *files_cgroup; };
struct file_operations; diff --git a/include/linux/filescontrol.h b/include/linux/filescontrol.h new file mode 100644 index 000000000000..49dc620cf64e --- /dev/null +++ b/include/linux/filescontrol.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* filescontrol.h - Files Controller + * + * Copyright 2014 Google Inc. + * Author: Brian Makin merimus@google.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_FILESCONTROL_H +#define _LINUX_FILESCONTROL_H + +#include <linux/fdtable.h> + +#ifdef CONFIG_CGROUP_FILES + +extern int files_cgroup_alloc_fd(struct files_struct *files, u64 n); +extern void files_cgroup_unalloc_fd(struct files_struct *files, u64 n); +extern u64 files_cgroup_count_fds(struct files_struct *files); +extern struct files_struct init_files; + +void files_cgroup_assign(struct files_struct *files); +void files_cgroup_remove(struct files_struct *files); + +#endif /* CONFIG_CGROUP_FILES */ +#endif /* _LINUX_FILESCONTROL_H */ diff --git a/init/Kconfig b/init/Kconfig index 2ee1384c4f81..418f94fb43b1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1197,6 +1197,16 @@ config CGROUP_V1_KILL default n depends on CGROUPS
+config CGROUP_FILES + bool "Files Resource Controller for Control Groups" + select PAGE_COUNTER + default n + help + Provides a cgroup resource controller that limits number of open + file handles within a cgroup. + This supports catching misbehaving processes and + return EMFILE instead of ENOMEM for kernel memory limits. + endif # CGROUPS
menuconfig NAMESPACES