From: Li Zefan <lizefan@huawei.com>
euler inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8TCPY
-------------------------------------------------
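pid_max is currently a single global value, so adjusting it on the host also takes effect inside containers. Move pid_max into struct pid_namespace to fix this problem: each PID namespace then has its own limit, and a newly created namespace inherits the pid_max of its parent. The per-namespace behaviour is controlled by the new CONFIG_PID_MAX_PER_NAMESPACE option.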
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Yi Yang <yiyang13@huawei.com>
---
 include/linux/pid.h           |  2 ++
 include/linux/pid_namespace.h |  3 ++
 init/Kconfig                  |  6 ++++
 kernel/pid.c                  | 57 +++++++++++++++++++++++++++++++++++
 kernel/pid_namespace.c        | 13 ++++++++
 kernel/sysctl.c               |  2 ++
 kernel/trace/pid_list.c       |  4 +++
 kernel/trace/trace.c          |  4 +++
 kernel/trace/trace.h          |  2 ++
 9 files changed, 93 insertions(+)
diff --git a/include/linux/pid.h b/include/linux/pid.h index 653a527574c4..b90bc447d2a2 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -113,8 +113,10 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new, struct pid_namespace; extern struct pid_namespace init_pid_ns;
+#ifndef CONFIG_PID_MAX_PER_NAMESPACE extern int pid_max; extern int pid_max_min, pid_max_max; +#endif
/* * look up a PID in the hash table. Must be called with the tasklist_lock diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index f9f9931e02d6..0a7c6dab0b7b 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -36,6 +36,9 @@ struct pid_namespace { #endif struct user_namespace *user_ns; struct ucounts *ucounts; +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + int pid_max; +#endif int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) diff --git a/init/Kconfig b/init/Kconfig index 869eea4108d0..8c07d158bb54 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2104,3 +2104,9 @@ config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE # <asm/syscall_wrapper.h>. config ARCH_HAS_SYSCALL_WRAPPER def_bool n + +config PID_MAX_PER_NAMESPACE + bool "Make pid_max per namespace" + default y + help + Say Y here to enable make pid_max per namespace. diff --git a/kernel/pid.c b/kernel/pid.c index 383abde0c208..4e9267988dd7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -45,6 +45,10 @@ #include <net/sock.h> #include <uapi/linux/pidfd.h>
+#ifdef CONFIG_PID_MAX_PER_NAMESPACE +#include <linux/kmemleak.h> +#endif + struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), .tasks = { @@ -59,12 +63,19 @@ struct pid init_struct_pid = { }, } };
+#ifndef CONFIG_PID_MAX_PER_NAMESPACE int pid_max = PID_MAX_DEFAULT; +#endif
#define RESERVED_PIDS 300
+#ifndef CONFIG_PID_MAX_PER_NAMESPACE int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; +#else +static int pid_max_min = RESERVED_PIDS + 1; +static int pid_max_max = PID_MAX_LIMIT; +#endif
/* * PID-map pages start out as NULL, they get allocated upon @@ -80,6 +91,9 @@ struct pid_namespace init_pid_ns = { .child_reaper = &init_task, .user_ns = &init_user_ns, .ns.inum = PROC_PID_INIT_INO, +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + .pid_max = PID_MAX_DEFAULT, +#endif #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif @@ -194,7 +208,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tid = set_tid[ns->level - i];
retval = -EINVAL; +#ifndef CONFIG_PID_MAX_PER_NAMESPACE if (tid < 1 || tid >= pid_max) +#else + if (tid < 1 || tid >= task_active_pid_ns(current)->pid_max) +#endif goto out_free; /* * Also fail if a PID != 1 is requested and @@ -234,7 +252,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * a partially initialized PID (see below). */ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, +#ifndef CONFIG_PID_MAX_PER_NAMESPACE pid_max, GFP_ATOMIC); +#else + tmp->pid_max, GFP_ATOMIC); +#endif } spin_unlock_irq(&pidmap_lock); idr_preload_end(); @@ -646,8 +668,37 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) return fd; }
+#ifdef CONFIG_PID_MAX_PER_NAMESPACE +static int proc_dointvec_pidmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp; + + tmp = *table; + tmp.data = &task_active_pid_ns(current)->pid_max; + + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ctl_table[] = { + { + .procname = "pid_max", + .data = &init_pid_ns.pid_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_pidmax, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, + {} +}; +#endif + void __init pid_idr_init(void) { +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + int pid_max = init_pid_ns.pid_max; +#endif /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
@@ -658,6 +709,9 @@ void __init pid_idr_init(void) PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+#ifdef CONFIG_PID_MAX_PER_NAMESPACE + init_pid_ns.pid_max = pid_max; +#endif idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = kmem_cache_create("pid", @@ -665,6 +719,9 @@ void __init pid_idr_init(void) __alignof__(struct pid), SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + register_sysctl_init("kernel", pid_ctl_table); +#endif }
static struct file *__pidfd_fget(struct task_struct *task, int fd) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 619972c78774..c837b1096dcb 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -110,6 +110,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + ns->pid_max = parent_pid_ns->pid_max; +#endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif @@ -295,6 +298,10 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next; +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + tmp.extra2 = &pid_ns->pid_max; +#endif + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (!ret && write) idr_set_cursor(&pid_ns->idr, next + 1); @@ -302,7 +309,9 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, return ret; }
+#ifndef CONFIG_PID_MAX_PER_NAMESPACE extern int pid_max; +#endif static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", @@ -310,7 +319,11 @@ static struct ctl_table pid_ns_ctl_table[] = { .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, .extra1 = SYSCTL_ZERO, +#ifndef CONFIG_PID_MAX_PER_NAMESPACE .extra2 = &pid_max, +#else + .extra2 = &init_pid_ns.pid_max, +#endif }, { } }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 354a2d294f52..e84df0818517 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1809,6 +1809,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifndef CONFIG_PID_MAX_PER_NAMESPACE { .procname = "pid_max", .data = &pid_max, @@ -1818,6 +1819,7 @@ static struct ctl_table kern_table[] = { .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, +#endif { .procname = "panic_on_oops", .data = &panic_on_oops, diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 95106d02b32d..8056d7441d05 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -414,7 +414,11 @@ struct trace_pid_list *trace_pid_list_alloc(void) int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */ +#ifndef CONFIG_PID_MAX_PER_NAMESPACE WARN_ON_ONCE(pid_max > (1 << 30)); +#else + WARN_ON_ONCE(task_active_pid_ns(current)->pid_max > (1 << 30)); +#endif
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); if (!pid_list) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b656cab67f67..5afa58302b06 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5423,7 +5423,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) { +#ifndef CONFIG_PID_MAX_PER_NAMESPACE tgid_map_max = pid_max; +#else + tgid_map_max = init_pid_ns.pid_max; +#endif map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), GFP_KERNEL);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d608f6128704..52123df262b2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -695,7 +695,9 @@ extern unsigned long tracing_thresh;
/* PID filtering */
+#ifndef CONFIG_PID_MAX_PER_NAMESPACE extern int pid_max; +#endif
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid);