[PATCH OLK-6.6 1/4] cgroups: Resource controller for open files

26 Dec 2023

From: Binder Makin <merimus@google.com>

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8ND8I

--------------------------------

Add a lockless resource controller for limiting the number of open
file handles.  This allows us to catch misbehaving processes
and return EMFILE instead of ENOMEM for kernel memory limits.

Original link: https://lwn.net/Articles/604129/. Since commit
https://gitlab.indel.ch/thirdparty/linux-indel/commit
/5b1efc027c0b51ca3e76f4e00c83358f8349f543
all memory accounting and limiting has been switched over to the
lockless page counters, so we convert the original resource counters
to lockless page counters.

The external functions are empty stubs when CONFIG_CGROUP_FILES is
disabled; this was refactored with reference to the original patch.

Signed-off-by: Binder Makin <merimus@google.com>

Conflicts:
Merge OLK-5.10 fix patch: e2b24a5adac6
Merge OLK-5.10 fix patch: 7e485fecbd7f
Merge OLK-5.10 fix patch: a466f5dda467
Merge OLK-5.10 fix patch: 338fc539bb48
Merge OLK-5.10 fix patch: 1951d5627095
Merge OLK-5.10 fix patch: 0b7329771d3d

Signed-off-by: chenridong <chenridong@huawei.com>
---
 fs/Makefile                   |   1 +
 fs/file.c                     |  43 ++++-
 fs/filescontrol.c             | 303 ++++++++++++++++++++++++++++++++++
 include/linux/cgroup_subsys.h |   4 +
 include/linux/fdtable.h       |   1 +
 include/linux/filescontrol.h  |  44 +++++
 init/Kconfig                  |  10 ++
 7 files changed, 400 insertions(+), 6 deletions(-)
 create mode 100644 fs/filescontrol.c
 create mode 100644 include/linux/filescontrol.h

diff --git a/fs/Makefile b/fs/Makefile
index f9541f40be4e..bb6164200a39 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o sysctls.o
 
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
+obj-$(CONFIG_CGROUP_FILES)	+= filescontrol.o
 obj-y				+= iomap/
 
 obj-y				+= quota/
diff --git a/fs/file.c b/fs/file.c
index 3e4a4dfa38fc..fdd36a423850 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -19,6 +19,7 @@
 #include <linux/bitops.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/filescontrol.h>
 #include <linux/close_range.h>
 #include <net/sock.h>
 
@@ -337,6 +338,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 	new_fdt->open_fds = newf->open_fds_init;
 	new_fdt->full_fds_bits = newf->full_fds_bits_init;
 	new_fdt->fd = &newf->fd_array[0];
+	files_cgroup_assign(newf);
 
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
@@ -401,9 +403,23 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 
 	rcu_assign_pointer(newf->fdt, new_fdt);
 
-	return newf;
+	if (!files_cgroup_get_fd(newf)) {
+		return newf;
+	}
+	/* could not get enough FD resources.  Need to clean up. */
+	new_fds = new_fdt->fd;
+	for (i = open_files; i != 0; i--) {
+		struct file *f = *new_fds++;
+
+		if (f)
+			fput(f);
+	}
+	if (new_fdt != &newf->fdtab)
+		__free_fdtable(new_fdt);
+	*errorp = -EMFILE;
 
 out_release:
+	files_cgroup_remove(newf);
 	kmem_cache_free(files_cachep, newf);
 out:
 	return NULL;
@@ -429,6 +445,7 @@ static struct fdtable *close_files(struct files_struct * files)
 			if (set & 1) {
 				struct file * file = xchg(&fdt->fd[i], NULL);
 				if (file) {
+					files_cgroup_unalloc_fd(files, 1);
 					filp_close(file, files);
 					cond_resched();
 				}
@@ -437,7 +454,7 @@ static struct fdtable *close_files(struct files_struct * files)
 			set >>= 1;
 		}
 	}
-
+	files_cgroup_remove(files);
 	return fdt;
 }
 
@@ -531,6 +548,10 @@ static int alloc_fd(unsigned start, unsigned end, unsigned flags)
 	 */
 	if (error)
 		goto repeat;
+	if (files_cgroup_alloc_fd(files, 1)) {
+		error = -EMFILE;
+		goto out;
+	}
 
 	if (start <= files->next_fd)
 		files->next_fd = fd + 1;
@@ -568,6 +589,8 @@ EXPORT_SYMBOL(get_unused_fd_flags);
 static void __put_unused_fd(struct files_struct *files, unsigned int fd)
 {
 	struct fdtable *fdt = files_fdtable(files);
+
+	files_cgroup_put_fd(files, fd);
 	__clear_open_fd(fd, fdt);
 	if (fd < files->next_fd)
 		files->next_fd = fd;
@@ -1106,6 +1129,7 @@ static int do_dup2(struct files_struct *files,
 	struct file *file, unsigned fd, unsigned flags)
 __releases(&files->file_lock)
 {
+	int err;
 	struct file *tofree;
 	struct fdtable *fdt;
 
@@ -1125,8 +1149,15 @@ __releases(&files->file_lock)
 	 */
 	fdt = files_fdtable(files);
 	tofree = fdt->fd[fd];
-	if (!tofree && fd_is_open(fd, fdt))
-		goto Ebusy;
+	if (!tofree && fd_is_open(fd, fdt)) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (!tofree && files_cgroup_alloc_fd(files, 1)) {
+		err = -EMFILE;
+		goto out;
+	}
 	get_file(file);
 	rcu_assign_pointer(fdt->fd[fd], file);
 	__set_open_fd(fd, fdt);
@@ -1141,9 +1172,9 @@ __releases(&files->file_lock)
 
 	return fd;
 
-Ebusy:
+out:
 	spin_unlock(&files->file_lock);
-	return -EBUSY;
+	return err;
 }
 
 int replace_fd(unsigned fd, struct file *file, unsigned flags)
diff --git a/fs/filescontrol.c b/fs/filescontrol.c
new file mode 100644
index 000000000000..ff71efa82be1
--- /dev/null
+++ b/fs/filescontrol.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0
+/* filescontrol.c - Cgroup controller for open file handles.
+ *
+ * Copyright 2014 Google Inc.
+ * Author: Brian Makin <merimus@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/page_counter.h>
+#include <linux/filescontrol.h>
+#include <linux/cgroup.h>
+#include <linux/export.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/fdtable.h>
+#include <linux/sched/signal.h>
+
+#define FILES_MAX ULONG_MAX
+#define FILES_MAX_STR "max"
+
+
+struct cgroup_subsys files_cgrp_subsys __read_mostly;
+EXPORT_SYMBOL(files_cgrp_subsys);
+
+struct files_cgroup {
+	struct cgroup_subsys_state css;
+	struct page_counter open_handles;
+};
+
+static inline struct files_cgroup *css_fcg(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct files_cgroup, css) : NULL;
+}
+
+static inline struct page_counter *
+css_res_open_handles(struct cgroup_subsys_state *css)
+{
+	return &css_fcg(css)->open_handles;
+}
+
+static inline struct files_cgroup *
+files_cgroup_from_files(struct files_struct *files)
+{
+	return files->files_cgroup;
+}
+
+
+/*
+ * Allocate the per-cgroup state. The open-handle counter is chained to the
+ * parent's counter (if there is a parent) and its max is set to FILES_MAX.
+ * Returns the new css, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct cgroup_subsys_state *
+files_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct files_cgroup *parent_fcg = css_fcg(parent_css);
+	struct page_counter *parent_counter = NULL;
+	struct files_cgroup *fcg;
+
+	fcg = kzalloc(sizeof(*fcg), GFP_KERNEL);
+	if (!fcg)
+		return ERR_PTR(-ENOMEM);
+
+	/* No parent css means there is no parent counter to chain to. */
+	if (parent_fcg)
+		parent_counter = &parent_fcg->open_handles;
+
+	page_counter_init(&fcg->open_handles, parent_counter);
+	page_counter_set_max(&fcg->open_handles, FILES_MAX);
+
+	return &fcg->css;
+}
+
+static void files_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css_fcg(css));
+}
+
+/* Count the fds marked open in @files' fd table (set bits in open_fds). */
+u64 files_cgroup_count_fds(struct files_struct *files)
+{
+	struct fdtable *fdt = files_fdtable(files);
+	unsigned int i;
+	u64 count = 0;
+
+	for (i = 0; i < DIV_ROUND_UP(fdt->max_fds, BITS_PER_LONG); i++)
+		count += hweight_long(fdt->open_fds[i]);
+	return count;
+}
+
+/*
+ * Move the first task's open-fd charge to the destination cgroup. If that
+ * would exceed the destination limit, keep the old charge and fail (-ENOSPC).
+ */
+static int files_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+	u64 num_files;
+	bool can_attach;
+	struct cgroup_subsys_state *to_css;
+	struct cgroup_subsys_state *from_css;
+	struct page_counter *from_res;
+	struct page_counter *to_res;
+	struct page_counter *fail_res;
+	struct files_struct *files;
+	struct task_struct *task = cgroup_taskset_first(tset, &to_css);
+
+	to_res = css_res_open_handles(to_css);
+
+	task_lock(task);
+	files = task->files;
+	if (!files || files == &init_files) {
+		task_unlock(task);
+		return 0;
+	}
+
+	from_css = &files_cgroup_from_files(files)->css;
+	from_res = css_res_open_handles(from_css);
+
+	spin_lock(&files->file_lock);
+	num_files = files_cgroup_count_fds(files);
+	page_counter_uncharge(from_res, num_files);
+
+	if (!page_counter_try_charge(to_res, num_files, &fail_res)) {
+		page_counter_charge(from_res, num_files);
+		pr_err("Open files limit overcommited\n");
+		can_attach = false;
+	} else {
+		css_put(from_css);
+		css_get(to_css);
+		task->files->files_cgroup = css_fcg(to_css);
+		can_attach = true;
+	}
+	spin_unlock(&files->file_lock);
+	task_unlock(task);
+	return can_attach ? 0 : -ENOSPC;
+}
+
+int files_cgroup_alloc_fd(struct files_struct *files, u64 n)
+{
+	/*
+	 * Kernel threads forked by kthreadd inherit the shared
+	 * files_struct 'init_files'. It is never assigned a
+	 * files_cgroup, so there is nothing to charge against.
+	 *
+	 * Kernel threads always stay in the root cgroup, and the
+	 * root files cgroup has no limit, so skipping the charge is
+	 * harmless; the only side effect is that files.usage of the
+	 * root files cgroup is not accurate.
+	 */
+	if (files != &init_files) {
+		struct page_counter *fail_res;
+		struct files_cgroup *files_cgroup =
+			files_cgroup_from_files(files);
+		if (!page_counter_try_charge(&files_cgroup->open_handles,
+				       n, &fail_res))
+			return -ENOMEM;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(files_cgroup_alloc_fd);
+
+void files_cgroup_unalloc_fd(struct files_struct *files, u64 n)
+{
+	/*
+	 * It's not charged so no need to uncharge, see comments in
+	 * files_cgroup_alloc_fd.
+	 */
+	if (files != &init_files) {
+		struct files_cgroup *files_cgroup =
+		       files_cgroup_from_files(files);
+		page_counter_uncharge(&files_cgroup->open_handles, n);
+	}
+}
+EXPORT_SYMBOL(files_cgroup_unalloc_fd);
+
+void files_cgroup_put_fd(struct files_struct *files, unsigned int fd)
+{
+	if (test_bit(fd, files_fdtable(files)->open_fds))
+		files_cgroup_unalloc_fd(files, 1);
+}
+
+int files_cgroup_get_fd(struct files_struct *newf)
+{
+	int err;
+
+	spin_lock(&newf->file_lock);
+	err = files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf));
+	spin_unlock(&newf->file_lock);
+	return err;
+}
+
+static int files_limit_read(struct seq_file *sf, void *v)
+{
+	struct files_cgroup *fcg = css_fcg(seq_css(sf));
+	struct page_counter *counter = &fcg->open_handles;
+	u64 limit = counter->max;
+
+	if (limit >= FILES_MAX)
+		seq_printf(sf, "%s\n", FILES_MAX_STR);
+	else
+		seq_printf(sf, "%llu\n", limit);
+
+	return 0;
+}
+
+/* Handle a write to the "limit" file: "max" or a number of open handles. */
+static ssize_t files_limit_write(struct kernfs_open_file *of,
+			char *buf, size_t nbytes, loff_t off)
+{
+	struct files_cgroup *fcg = css_fcg(of_css(of));
+	u64 limit;
+	int err;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, FILES_MAX_STR)) {
+		limit = FILES_MAX;
+	} else {
+		err = kstrtoull(buf, 0, &limit);
+		if (err)
+			return err;
+	}
+
+	/*
+	 * Limit updates don't need to be mutex'd, since it isn't
+	 * critical that any racing fork()s follow the new limit.
+	 * Do report a failed update instead of claiming success.
+	 */
+	err = page_counter_set_max(&fcg->open_handles, limit);
+	return err ?: nbytes;
+}
+
+
+static u64 files_usage_read(struct cgroup_subsys_state *css,
+			struct cftype *cft)
+{
+	struct files_cgroup *fcg = css_fcg(css);
+
+	return page_counter_read(&fcg->open_handles);
+}
+
+static struct cftype files[] = {
+	{
+		.name = "limit",
+		.seq_show  = files_limit_read,
+		.write = files_limit_write,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "usage",
+		.read_u64 = files_usage_read,
+	},
+	{ }
+};
+
+struct cgroup_subsys files_cgrp_subsys = {
+	.css_alloc = files_cgroup_css_alloc,
+	.css_free = files_cgroup_css_free,
+	.can_attach = files_cgroup_can_attach,
+	.legacy_cftypes = files,
+	.dfl_cftypes = files,
+};
+
+/*
+ * This can race with cgroup migration of the current task, so use
+ * task_get_css() to obtain a valid css.
+ */
+void files_cgroup_assign(struct files_struct *files)
+{
+	struct cgroup_subsys_state *css;
+
+	if (files == &init_files)
+		return;
+
+	css = task_get_css(current, files_cgrp_id);
+	files->files_cgroup = container_of(css, struct files_cgroup, css);
+}
+
+void files_cgroup_remove(struct files_struct *files)
+{
+	struct task_struct *tsk = current;
+	struct files_cgroup *fcg;
+
+	if (files == &init_files)
+		return;
+
+	task_lock(tsk);
+	spin_lock(&files->file_lock);
+	fcg = files_cgroup_from_files(files);
+	css_put(&fcg->css);
+	spin_unlock(&files->file_lock);
+	task_unlock(tsk);
+}
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 445235487230..85fa78049bd0 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -72,6 +72,10 @@ SUBSYS(misc)
 SUBSYS(debug)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_FILES)
+SUBSYS(files)
+#endif
+
 /*
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..22b8b03fef6d 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -65,6 +65,7 @@ struct files_struct {
 	unsigned long open_fds_init[1];
 	unsigned long full_fds_bits_init[1];
 	struct file __rcu * fd_array[NR_OPEN_DEFAULT];
+	struct files_cgroup *files_cgroup;
 };
 
 struct file_operations;
diff --git a/include/linux/filescontrol.h b/include/linux/filescontrol.h
new file mode 100644
index 000000000000..07403ada16d7
--- /dev/null
+++ b/include/linux/filescontrol.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* filescontrol.h - Files Controller
+ *
+ * Copyright 2014 Google Inc.
+ * Author: Brian Makin <merimus@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_FILESCONTROL_H
+#define _LINUX_FILESCONTROL_H
+
+#include <linux/fdtable.h>
+
+#ifdef CONFIG_CGROUP_FILES
+extern int files_cgroup_alloc_fd(struct files_struct *files, u64 n);
+extern void files_cgroup_unalloc_fd(struct files_struct *files, u64 n);
+
+extern struct files_struct init_files;
+extern void files_cgroup_assign(struct files_struct *files);
+extern void files_cgroup_remove(struct files_struct *files);
+
+extern int files_cgroup_get_fd(struct files_struct *newf);
+extern void files_cgroup_put_fd(struct files_struct *files, unsigned int fd);
+#else /* !CONFIG_CGROUP_FILES: no-op stubs, charging always succeeds */
+static inline int files_cgroup_alloc_fd(struct files_struct *files, u64 n) { return 0; }
+static inline void files_cgroup_unalloc_fd(struct files_struct *files, u64 n) { }
+
+static inline void files_cgroup_assign(struct files_struct *files) { }
+static inline void files_cgroup_remove(struct files_struct *files) { }
+
+static inline int files_cgroup_get_fd(struct files_struct *newf) { return 0; }
+static inline void files_cgroup_put_fd(struct files_struct *files, unsigned int fd) { }
+#endif /* CONFIG_CGROUP_FILES */
+
+#endif /* _LINUX_FILESCONTROL_H */
diff --git a/init/Kconfig b/init/Kconfig
index 9209fc5b39b9..ca89d9787c71 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1212,6 +1212,16 @@ config CGROUP_V1_KILL
 	default n
 	depends on CGROUPS
 
+config CGROUP_FILES
+	bool "Files Resource Controller for Control Groups"
+	select PAGE_COUNTER
+	default n
+	help
+	  Provides a cgroup resource controller that limits the number of
+	  open file handles within a cgroup.
+	  This makes it possible to catch misbehaving processes and
+	  return EMFILE instead of ENOMEM for kernel memory limits.
+
 endif # CGROUPS
 
 menuconfig NAMESPACES
-- 
2.34.1