
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICA1GK

--------------------------------

Add struct pidns_loadavg to record and track the average load of the
tasks within a pid namespace. Use a delayed_work to update this data
structure for all pid namespaces at LOAD_FREQ intervals.

Signed-off-by: GONG Ruiqi <gongruiqi1@huawei.com>
---
 include/linux/pid.h           |   5 ++
 include/linux/pid_namespace.h |  18 ++++++
 kernel/bpf-rvi/Kconfig        |   1 +
 kernel/pid.c                  |  10 +++
 kernel/pid_namespace.c        | 117 ++++++++++++++++++++++++++++++++++
 5 files changed, 151 insertions(+)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index b90bc447d2a2..9ddee8589956 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -217,4 +217,9 @@ pid_t pid_vnr(struct pid *pid);
 		}						\
 		task = tg___;					\
 	} while_each_pid_task(pid, type, task)
+
+#ifdef CONFIG_BPF_RVI
+struct pidns_loadavg;
+extern struct pidns_loadavg init_pidns_loadavg;
+#endif
 #endif /* _LINUX_PID_H */
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 28161eefca5d..062d6690b69a 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -45,7 +45,11 @@ struct pid_namespace {
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	int memfd_noexec_scope;
 #endif
+#ifdef CONFIG_BPF_RVI
+	KABI_USE(1, struct pidns_loadavg *loadavg)
+#else
 	KABI_RESERVE(1)
+#endif
 	KABI_RESERVE(2)
 	KABI_RESERVE(3)
 } __randomize_layout;
@@ -132,6 +136,20 @@ static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
 
 #ifdef CONFIG_BPF_RVI
 extern struct task_struct *get_current_level1_reaper(void);
+
+/*
+ * This struct should be viewed as an extension but not an entity.
+ * IOW it doesn't hold refcount to struct pid_namespace (but the list does), and
+ * all its members are semantically embedded in struct pid_namespace.
+ */
+struct pidns_loadavg {
+	struct pid_namespace *pidns;
+	struct list_head list;
+	unsigned long load_tasks;
+	unsigned long avenrun[3];
+};
+
+extern struct pidns_loadavg init_pidns_loadavg;
 #endif
 
 #endif /* _LINUX_PID_NS_H */
diff --git a/kernel/bpf-rvi/Kconfig b/kernel/bpf-rvi/Kconfig
index 8a9cbac36a0c..c1a76498eeee 100644
--- a/kernel/bpf-rvi/Kconfig
+++ b/kernel/bpf-rvi/Kconfig
@@ -7,6 +7,7 @@ config BPF_RVI
 	depends on BPF_SYSCALL
 	depends on BPF_JIT
 	depends on CPUSETS
+	depends on PID_NS
 	select BPF_RVI_BLK_BFQ if IOSCHED_BFQ = y # built-in required
 	default n
 	help
diff --git a/kernel/pid.c b/kernel/pid.c
index 8000cf327985..dd14df48b118 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -93,9 +93,19 @@ struct pid_namespace init_pid_ns = {
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
 #endif
+#ifdef CONFIG_BPF_RVI
+	.loadavg = &init_pidns_loadavg,
+#endif
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
+#ifdef CONFIG_BPF_RVI
+struct pidns_loadavg init_pidns_loadavg = {
+	.pidns = &init_pid_ns,
+	.list = LIST_HEAD_INIT(init_pidns_loadavg.list),
+};
+#endif
+
 /*
  * Note: disable interrupts while the pidmap_lock is held as an
  * interrupt might come in and do read_lock(&tasklist_lock).
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 1180070fc2a0..2e1afc01240a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -22,11 +22,18 @@
 #include <linux/export.h>
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
+#ifdef CONFIG_BPF_RVI
+#include <linux/sched/loadavg.h>
+#endif
 #include <linux/idr.h>
 #include "pid_sysctl.h"
 
 static DEFINE_MUTEX(pid_caches_mutex);
 static struct kmem_cache *pid_ns_cachep;
+#ifdef CONFIG_BPF_RVI
+static struct kmem_cache *pidns_loadavg_cachep;
+static DEFINE_SPINLOCK(pidns_list_lock);
+#endif
 /* Write once array, filled from the beginning. */
 static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
 
@@ -116,6 +123,18 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
 #endif
+
+#ifdef CONFIG_BPF_RVI
+	ns->loadavg = kmem_cache_zalloc(pidns_loadavg_cachep, GFP_KERNEL);
+	if (ns->loadavg == NULL)
+		goto out_free_idr;
+	ns->loadavg->pidns = ns;
+	spin_lock(&pidns_list_lock);
+	// additional 1 refcount for the list
+	list_add_tail(&get_pid_ns(ns)->loadavg->list, &init_pidns_loadavg.list);
+	spin_unlock(&pidns_list_lock);
+#endif
+
 	return ns;
 
 out_free_idr:
@@ -142,6 +161,13 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
 	ns_free_inum(&ns->ns);
 
 	idr_destroy(&ns->idr);
+#ifdef CONFIG_BPF_RVI
+	/*
+	 * ns->loadavg's lifecycle aligns precisely with ns,
+	 * so don't need RCU delayed free.
+	 */
+	kmem_cache_free(pidns_loadavg_cachep, ns->loadavg);
+#endif
 	call_rcu(&ns->rcu, delayed_free_pidns);
 }
 
@@ -481,6 +507,11 @@ const struct proc_ns_operations pidns_for_children_operations = {
 	.get_parent	= pidns_get_parent,
 };
 
+#ifdef CONFIG_BPF_RVI
+static void pidns_calc_loadavg_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(pidns_calc_loadavg_work, pidns_calc_loadavg_workfn);
+#endif
+
 static __init int pid_namespaces_init(void)
 {
 	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
@@ -490,7 +521,93 @@ static __init int pid_namespaces_init(void)
 #endif
 
 	register_pid_ns_sysctl_table_vm();
+
+#ifdef CONFIG_BPF_RVI
+	pidns_loadavg_cachep = KMEM_CACHE(pidns_loadavg, SLAB_PANIC | SLAB_ACCOUNT);
+	schedule_delayed_work(&pidns_calc_loadavg_work, LOAD_FREQ);
+#endif
 
 	return 0;
 }
 __initcall(pid_namespaces_init);
+
+#ifdef CONFIG_BPF_RVI
+static void pidns_list_reset(void)
+{
+	struct list_head *pos, *tmp;
+
+	spin_lock(&pidns_list_lock);
+	list_for_each_safe(pos, tmp, &init_pidns_loadavg.list) {
+		struct pidns_loadavg *entry = list_entry(pos, struct pidns_loadavg, list);
+		struct pid_namespace *pidns = entry->pidns;
+
+		/*
+		 * Where the actual releasing of pidns is triggered:
+		 *
+		 * refcount == 1 means the pidns is only referred by this list,
+		 * which should be released.
+		 */
+		if (refcount_read(&pidns->ns.count) == 1) {
+			list_del(pos);
+			put_pid_ns(pidns);
+			continue;
+		}
+
+		pidns->loadavg->load_tasks = 0; // reset
+	}
+	spin_unlock(&pidns_list_lock);
+}
+
+static void pidns_update_load_tasks(void)
+{
+	struct task_struct *p, *t;
+
+	rcu_read_lock();
+	for_each_process_thread(p, t) {
+		// exists for sure, don't need get_pid_ns()
+		struct pid_namespace *pidns = task_active_pid_ns(t);
+		unsigned int state = READ_ONCE(t->__state) & TASK_REPORT;
+
+		if (state != TASK_UNINTERRUPTIBLE && state != TASK_RUNNING)
+			continue;
+
+		// Skip calculating init_pid_ns's loadavg. Meaningless.
+		while (pidns != &init_pid_ns) {
+			pidns->loadavg->load_tasks += 1;
+			pidns = pidns->parent;
+		}
+	}
+	rcu_read_unlock();
+}
+
+static void pidns_calc_avenrun(void)
+{
+	struct list_head *pos;
+
+	spin_lock(&pidns_list_lock);
+	/*
+	 * As the loadavg of init_pid_ns is exactly /proc/loadavg, avoid redundant
+	 * re-calculation for init_pid_ns, and reuse init_pidns_loadavg.list as the
+	 * list head.
+	 */
+	list_for_each(pos, &init_pidns_loadavg.list) {
+		struct pidns_loadavg *entry = list_entry(pos, struct pidns_loadavg, list);
+		long active = entry->load_tasks;
+
+		/* Reference: calc_global_load() */
+		active = active > 0 ? active * FIXED_1 : 0;
+		entry->avenrun[0] = calc_load(entry->avenrun[0], EXP_1, active);
+		entry->avenrun[1] = calc_load(entry->avenrun[1], EXP_5, active);
+		entry->avenrun[2] = calc_load(entry->avenrun[2], EXP_15, active);
+	}
+	spin_unlock(&pidns_list_lock);
+}
+
+static void pidns_calc_loadavg_workfn(struct work_struct *work)
+{
+	pidns_list_reset();
+	pidns_update_load_tasks();
+	pidns_calc_avenrun();
+	schedule_delayed_work(&pidns_calc_loadavg_work, LOAD_FREQ);
+}
+#endif /* CONFIG_BPF_RVI */
-- 
2.25.1
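
For reference only, and not part of the patch above: each avenrun[] slot holds an
exponentially decayed average in FSHIFT = 11 fixed point, updated once per LOAD_FREQ
in the same way calc_global_load() maintains the system-wide /proc/loadavg. Below is a
minimal userspace sketch of that decay; calc_load() and the FIXED_1/EXP_* constants
mirror include/linux/sched/loadavg.h, while the samples[] task counts are invented
purely for illustration.

/*
 * Illustrative sketch: how a per-pidns avenrun[] evolves when load_tasks
 * counts 4, 4, 2, 1, 0, 0 R/D tasks over six consecutive LOAD_FREQ ticks.
 */
#include <stdio.h>

#define FSHIFT		11			/* bits of fixed-point precision */
#define FIXED_1		(1UL << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1		1884			/* 1/exp(5sec/1min) in fixed point */
#define EXP_5		2014			/* 1/exp(5sec/5min) */
#define EXP_15		2037			/* 1/exp(5sec/15min) */

#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	unsigned long newload = load * exp + active * (FIXED_1 - exp);

	if (active >= load)
		newload += FIXED_1 - 1;
	return newload / FIXED_1;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	/* made-up per-tick counts of runnable + uninterruptible tasks */
	unsigned long samples[] = { 4, 4, 2, 1, 0, 0 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned long active = samples[i] > 0 ? samples[i] * FIXED_1 : 0;

		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
		printf("tick %u: %lu.%02lu %lu.%02lu %lu.%02lu\n", i,
		       LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]),
		       LOAD_INT(avenrun[1]), LOAD_FRAC(avenrun[1]),
		       LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
	}
	return 0;
}

Scaling the task count by FIXED_1 before the decay, as pidns_calc_avenrun() does, is what
makes a steady count of N runnable tasks converge toward a displayed load of N.00.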