From: Li Zefan <lizefan@huawei.com>
euler inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8TCPY
-------------------------------------------------
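pid_max is currently a single global value, so adjusting it on the host also takes effect inside containers. Move pid_max into struct pid_namespace (behind CONFIG_PID_MAX_PER_NAMESPACE) so that each PID namespace has its own limit: a new namespace inherits the value from its parent, and the kernel.pid_max sysctl reads and writes the value of the caller's PID namespace.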
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Yi Yang <yiyang13@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig |  1 +
 arch/x86/configs/openeuler_defconfig   | 37 +++++++++++++++---
 include/linux/pid.h                    |  2 +
 include/linux/pid_namespace.h          |  3 ++
 init/Kconfig                           |  6 +++
 kernel/pid.c                           | 53 ++++++++++++++++++++++++++
 kernel/pid_namespace.c                 | 13 +++++++
 kernel/sysctl.c                        |  2 +
 kernel/trace/pid_list.c                |  4 ++
 kernel/trace/trace.c                   |  4 ++
 kernel/trace/trace.h                   |  2 +
 11 files changed, 122 insertions(+), 5 deletions(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 9a053bc885dd..a040a64a8350 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -999,6 +999,7 @@ CONFIG_ARCH_USE_QUEUED_RWLOCKS=y CONFIG_QUEUED_RWLOCKS=y CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_PID_MAX_PER_NAMESPACE=y CONFIG_FREEZER=y
# diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 4bf011ed6307..da0f386608d7 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -2,6 +2,7 @@ # Automatically generated file; DO NOT EDIT. # Linux/x86 6.6.0 Kernel Configuration # +CONFIG_TOOLS_SUPPORT_RELR=y CONFIG_IRQ_WORK=y CONFIG_BUILDTIME_TABLE_SORT=y CONFIG_THREAD_INFO_IN_TASK=y @@ -66,7 +67,6 @@ CONFIG_SPARSE_IRQ=y
CONFIG_CLOCKSOURCE_WATCHDOG=y CONFIG_ARCH_CLOCKSOURCE_INIT=y -CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y CONFIG_GENERIC_TIME_VSYSCALL=y CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y @@ -90,6 +90,7 @@ CONFIG_CONTEXT_TRACKING_USER=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US=125 +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y # end of Timers subsystem
CONFIG_BPF=y @@ -172,6 +173,7 @@ CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y CONFIG_CC_IMPLICIT_FALLTHROUGH="-Wimplicit-fallthrough=5" CONFIG_GCC11_NO_ARRAY_BOUNDS=y +CONFIG_CC_NO_ARRAY_BOUNDS=y CONFIG_ARCH_SUPPORTS_INT128=y CONFIG_NUMA_BALANCING=y CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y @@ -188,13 +190,13 @@ CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_V1_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_QOS_SCHED=y +CONFIG_QOS_SCHED_SMT_EXPELLER=y CONFIG_QOS_SCHED_PRIO_LB=y CONFIG_FAIR_GROUP_SCHED=y -CONFIG_QOS_SCHED_SMT_EXPELLER=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y -CONFIG_SCHED_MM_CID=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y +CONFIG_SCHED_MM_CID=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y @@ -287,6 +289,8 @@ CONFIG_PERF_EVENTS=y CONFIG_SYSTEM_DATA_VERIFICATION=y CONFIG_PROFILING=y CONFIG_TRACEPOINTS=y +CONFIG_KABI_RESERVE=y +CONFIG_KABI_SIZE_ALIGN_CHECKS=y
# # Kexec and crash features @@ -512,6 +516,7 @@ CONFIG_LEGACY_VSYSCALL_XONLY=y # CONFIG_CMDLINE_BOOL is not set CONFIG_MODIFY_LDT_SYSCALL=y # CONFIG_STRICT_SIGALTSTACK_SIZE is not set +CONFIG_HAVE_LIVEPATCH_FTRACE=y CONFIG_HAVE_LIVEPATCH_WO_FTRACE=y
# @@ -542,6 +547,7 @@ CONFIG_CALL_DEPTH_TRACKING=y CONFIG_CPU_IBPB_ENTRY=y CONFIG_CPU_IBRS_ENTRY=y CONFIG_CPU_SRSO=y +# CONFIG_SLS is not set # CONFIG_GDS_FORCE_MITIGATION is not set CONFIG_ARCH_HAS_ADD_PAGES=y
@@ -927,6 +933,7 @@ CONFIG_MODULE_SIG_ALL=y CONFIG_MODULE_SIG_SHA256=y # CONFIG_MODULE_SIG_SHA384 is not set # CONFIG_MODULE_SIG_SHA512 is not set +# CONFIG_MODULE_SIG_SM3 is not set CONFIG_MODULE_SIG_HASH="sha256" CONFIG_MODULE_COMPRESS_NONE=y # CONFIG_MODULE_COMPRESS_GZIP is not set @@ -944,6 +951,7 @@ CONFIG_BLK_ICQ=y CONFIG_BLK_DEV_BSGLIB=y CONFIG_BLK_DEV_INTEGRITY=y CONFIG_BLK_DEV_INTEGRITY_T10=m +CONFIG_BLK_DEV_WRITE_MOUNTED=y CONFIG_BLK_DEV_ZONED=y CONFIG_BLK_DEV_THROTTLING=y # CONFIG_BLK_DEV_THROTTLING_LOW is not set @@ -1015,6 +1023,7 @@ CONFIG_QUEUED_RWLOCKS=y CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_PID_MAX_PER_NAMESPACE=y CONFIG_FREEZER=y
# @@ -1099,6 +1108,7 @@ CONFIG_DEVICE_MIGRATION=y CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y CONFIG_ARCH_ENABLE_THP_MIGRATION=y CONFIG_CONTIG_ALLOC=y +CONFIG_PCP_BATCH_SCALE_MAX=5 CONFIG_PHYS_ADDR_T_64BIT=y CONFIG_MMU_NOTIFIER=y CONFIG_KSM=y @@ -1155,6 +1165,8 @@ CONFIG_LRU_GEN=y CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y CONFIG_LOCK_MM_AND_FIND_VMA=y +# CONFIG_ASCEND_OOM is not set +# CONFIG_PAGE_CACHE_LIMIT is not set
# # Data Access Monitoring @@ -1949,6 +1961,7 @@ CONFIG_PAGE_POOL=y # CONFIG_PAGE_POOL_STATS is not set CONFIG_FAILOVER=m CONFIG_ETHTOOL_NETLINK=y +CONFIG_NETACC_BPF=y
# # Device Drivers @@ -2743,6 +2756,7 @@ CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m CONFIG_NET_TEAM_MODE_LOADBALANCE=m CONFIG_MACVLAN=m CONFIG_MACVTAP=m +# CONFIG_IPVLAN_L2E is not set CONFIG_IPVLAN_L3S=y CONFIG_IPVLAN=m CONFIG_IPVTAP=m @@ -5636,6 +5650,7 @@ CONFIG_FB_HYPERV=m # CONFIG_FB_SIMPLE is not set # CONFIG_FB_SSD1307 is not set # CONFIG_FB_SM712 is not set +# CONFIG_FB_LS2K500 is not set CONFIG_FB_CORE=y CONFIG_FB_NOTIFY=y # CONFIG_FIRMWARE_EDID is not set @@ -7978,6 +7993,12 @@ CONFIG_INTEL_TH_PTI=m # CONFIG_MOST is not set # CONFIG_PECI is not set # CONFIG_HTE is not set + +# +# CPU Inspect +# +# CONFIG_CPU_INSPECT is not set +# end of CPU Inspect # end of Device Drivers
# @@ -8300,6 +8321,7 @@ CONFIG_NLS_UCS2_UTILS=m CONFIG_DLM=m CONFIG_DLM_DEBUG=y CONFIG_UNICODE=y +# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set CONFIG_IO_WQ=y # end of File systems
@@ -8407,6 +8429,7 @@ CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,appar CONFIG_INIT_STACK_NONE=y # CONFIG_INIT_ON_ALLOC_DEFAULT_ON is not set # CONFIG_INIT_ON_FREE_DEFAULT_ON is not set +# CONFIG_ZERO_CALL_USED_REGS is not set # end of Memory initialization
# @@ -8418,6 +8441,8 @@ CONFIG_LIST_HARDENED=y
CONFIG_RANDSTRUCT_NONE=y # end of Kernel hardening options + +# CONFIG_SECURITY_BOOT_INIT is not set # end of Security options
CONFIG_XOR_BLOCKS=m @@ -8889,6 +8914,7 @@ CONFIG_DEBUG_INFO_COMPRESSED_NONE=y # CONFIG_DEBUG_INFO_SPLIT is not set CONFIG_DEBUG_INFO_BTF=y CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_LANG_EXCLUDE=y CONFIG_DEBUG_INFO_BTF_MODULES=y # CONFIG_MODULE_ALLOW_BTF_MISMATCH is not set # CONFIG_GDB_SCRIPTS is not set @@ -8928,6 +8954,8 @@ CONFIG_ARCH_HAS_EARLY_DEBUG=y CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y # CONFIG_UBSAN is not set CONFIG_HAVE_ARCH_KCSAN=y +CONFIG_HAVE_KCSAN_COMPILER=y +# CONFIG_KCSAN is not set # end of Generic Kernel Debugging Instruments
# @@ -8988,6 +9016,7 @@ CONFIG_LOCKUP_DETECTOR=y CONFIG_SOFTLOCKUP_DETECTOR=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_HAVE_HARDLOCKUP_DETECTOR_BUDDY=y +# CONFIG_SDEI_WATCHDOG is not set CONFIG_HARDLOCKUP_DETECTOR=y # CONFIG_HARDLOCKUP_DETECTOR_PREFER_BUDDY is not set CONFIG_HARDLOCKUP_DETECTOR_PERF=y @@ -9205,5 +9234,3 @@ CONFIG_ARCH_USE_MEMTEST=y # # end of Rust hacking # end of Kernel hacking -CONFIG_KABI_SIZE_ALIGN_CHECKS=y -CONFIG_KABI_RESERVE=y diff --git a/include/linux/pid.h b/include/linux/pid.h index 653a527574c4..b90bc447d2a2 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -113,8 +113,10 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new, struct pid_namespace; extern struct pid_namespace init_pid_ns;
+#ifndef CONFIG_PID_MAX_PER_NAMESPACE extern int pid_max; extern int pid_max_min, pid_max_max; +#endif
/* * look up a PID in the hash table. Must be called with the tasklist_lock diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index f9f9931e02d6..0a7c6dab0b7b 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -36,6 +36,9 @@ struct pid_namespace { #endif struct user_namespace *user_ns; struct ucounts *ucounts; +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + int pid_max; +#endif int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) diff --git a/init/Kconfig b/init/Kconfig index 869eea4108d0..d52550d35427 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2104,3 +2104,9 @@ config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE # <asm/syscall_wrapper.h>. config ARCH_HAS_SYSCALL_WRAPPER def_bool n + +config PID_MAX_PER_NAMESPACE + bool "Make pid_max per namespace" + default n + help + Say Y here to make pid_max a per-PID-namespace setting. diff --git a/kernel/pid.c b/kernel/pid.c index 383abde0c208..b6791a8a211d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -59,12 +59,19 @@ struct pid init_struct_pid = { }, } };
+#ifndef CONFIG_PID_MAX_PER_NAMESPACE int pid_max = PID_MAX_DEFAULT; +#endif
#define RESERVED_PIDS 300
+#ifndef CONFIG_PID_MAX_PER_NAMESPACE int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; +#else +static int pid_max_min = RESERVED_PIDS + 1; +static int pid_max_max = PID_MAX_LIMIT; +#endif
/* * PID-map pages start out as NULL, they get allocated upon @@ -80,6 +87,9 @@ struct pid_namespace init_pid_ns = { .child_reaper = &init_task, .user_ns = &init_user_ns, .ns.inum = PROC_PID_INIT_INO, +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + .pid_max = PID_MAX_DEFAULT, +#endif #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif @@ -194,7 +204,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tid = set_tid[ns->level - i];
retval = -EINVAL; +#ifndef CONFIG_PID_MAX_PER_NAMESPACE if (tid < 1 || tid >= pid_max) +#else + if (tid < 1 || tid >= tmp->pid_max) +#endif goto out_free; /* * Also fail if a PID != 1 is requested and @@ -234,7 +248,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * a partially initialized PID (see below). */ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, +#ifndef CONFIG_PID_MAX_PER_NAMESPACE pid_max, GFP_ATOMIC); +#else + tmp->pid_max, GFP_ATOMIC); +#endif } spin_unlock_irq(&pidmap_lock); idr_preload_end(); @@ -646,8 +664,37 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) return fd; }
+#ifdef CONFIG_PID_MAX_PER_NAMESPACE +static int proc_dointvec_pidmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp; + + tmp = *table; + tmp.data = &task_active_pid_ns(current)->pid_max; + + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ctl_table[] = { + { + .procname = "pid_max", + .data = &init_pid_ns.pid_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_pidmax, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, + {} +}; +#endif + void __init pid_idr_init(void) { +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + int pid_max = init_pid_ns.pid_max; +#endif /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
@@ -658,6 +705,9 @@ void __init pid_idr_init(void) PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+#ifdef CONFIG_PID_MAX_PER_NAMESPACE + init_pid_ns.pid_max = pid_max; +#endif idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = kmem_cache_create("pid", @@ -665,6 +715,9 @@ void __init pid_idr_init(void) __alignof__(struct pid), SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + register_sysctl_init("kernel", pid_ctl_table); +#endif }
static struct file *__pidfd_fget(struct task_struct *task, int fd) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 619972c78774..c837b1096dcb 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -110,6 +110,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + ns->pid_max = parent_pid_ns->pid_max; +#endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif @@ -295,6 +298,10 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next; +#ifdef CONFIG_PID_MAX_PER_NAMESPACE + tmp.extra2 = &pid_ns->pid_max; +#endif + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (!ret && write) idr_set_cursor(&pid_ns->idr, next + 1); @@ -302,7 +309,9 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, return ret; }
+#ifndef CONFIG_PID_MAX_PER_NAMESPACE extern int pid_max; +#endif static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", @@ -310,7 +319,11 @@ static struct ctl_table pid_ns_ctl_table[] = { .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, .extra1 = SYSCTL_ZERO, +#ifndef CONFIG_PID_MAX_PER_NAMESPACE .extra2 = &pid_max, +#else + .extra2 = &init_pid_ns.pid_max, +#endif }, { } }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 354a2d294f52..e84df0818517 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1809,6 +1809,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifndef CONFIG_PID_MAX_PER_NAMESPACE { .procname = "pid_max", .data = &pid_max, @@ -1818,6 +1819,7 @@ static struct ctl_table kern_table[] = { .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, +#endif { .procname = "panic_on_oops", .data = &panic_on_oops, diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 95106d02b32d..8056d7441d05 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -414,7 +414,11 @@ struct trace_pid_list *trace_pid_list_alloc(void) int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */ +#ifndef CONFIG_PID_MAX_PER_NAMESPACE WARN_ON_ONCE(pid_max > (1 << 30)); +#else + WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30)); +#endif
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); if (!pid_list) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b656cab67f67..5afa58302b06 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5423,7 +5423,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) { +#ifndef CONFIG_PID_MAX_PER_NAMESPACE tgid_map_max = pid_max; +#else + tgid_map_max = init_pid_ns.pid_max; +#endif map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), GFP_KERNEL);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d608f6128704..52123df262b2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -695,7 +695,9 @@ extern unsigned long tracing_thresh;
/* PID filtering */
+#ifndef CONFIG_PID_MAX_PER_NAMESPACE extern int pid_max; +#endif
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid);