Hi Xinnpeng and Chengjian,
Thanks for your patchset.
这组 Patch 前前后后 Review 了挺长时间了,对 KABI 的修复是我最担心的。 PATCH v10 的改法中,将 psi 放在了结构体的前面。一旦代码中出现对 cgrp 的 free 操作,便会出问题。虽然 patch 中,修改了两处 free 的位置。 但是我仍然担心,相关代码后续如有修改,会存在隐患。
+struct cgroup_psi {
- /* used to track pressure stalls */
- struct psi_group psi;
- struct cgroup cgrp;
+};
与 cgroup 组的同学讨论,struct cgroup 的申请释放使用都是在 kernel 中实现的, 不存在模块中直接使用的情况。而且参考了业界 OS (CENTOS/RHEL) ,针对 psi 也是这样规避的。所以,我也觉得,这种方式更好一些。
struct cgroup { ... /* * RHEL8: * The cgroup structures are all allocated by the core kernel * code at run time. It is also accessed only the cgroup core code * and so changes made to the cgroup structure should not affect * third-party kernel modules. However, a number of important kernel * data structures do contain pointer to a cgroup structure and so * the kABI signature has to be maintained. * * The ancestor_ids[] arrary has to be at the end of structure. */ RH_KABI_BROKEN_INSERT_BLOCK( struct cgroup *old_dom_cgrp; /* used while enabling threaded */
/* Used to store internal freezer state */ struct cgroup_freezer_state freezer;
/* used to track pressure stalls */ struct psi_group psi; ) /* RH_KABI_BROKEN_INSERT_BLOCK */
/* * RHEL8: * The ancestor_ids[] should only be used by cgroup core. * External kernel modules should not used it. */
所以,我觉得在 openEuler 中这样修改应该就可以,
struct cgroup { ... #ifndef __GENKSYMS__ /* used to track pressure stalls */ struct psi_group psi; #endif /* ids of the ancestors at each level including self */ int ancestor_ids[]; };
On 2021/12/8 17:44, Liu Xinpeng wrote:
Kabi compatibility with old third modules
openEuler inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn # openEuler_contributor Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn # openEuler_contributor Reviewed-by: Chen Hui judy.chenhui@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com
include/linux/cgroup-defs.h | 14 +++++++++++--- include/linux/cgroup.h | 4 +++- include/linux/kernfs.h | 8 +++++--- include/linux/kthread.h | 3 +++ include/linux/mmzone.h | 1 - include/linux/page-flags.h | 6 ++++-- include/linux/sched.h | 14 +++++++++----- include/linux/swap.h | 3 ++- kernel/cgroup/cgroup.c | 25 ++++++++++++++++++++++--- kernel/kthread.c | 2 ++ kernel/sched/psi.c | 8 ++++++-- mm/filemap.c | 2 +- mm/memcontrol.c | 2 -- mm/vmstat.c | 1 - mm/workingset.c | 39 +++++++++++++++++++++++++++++++-------- 15 files changed, 99 insertions(+), 33 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c026665..c94c2bf 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -443,9 +443,6 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work;
- /* used to track pressure stalls */
- struct psi_group psi;
- /* used to store eBPF programs */ struct cgroup_bpf bpf;
@@ -456,6 +453,17 @@ struct cgroup { int ancestor_ids[]; };
+struct cgroup_psi {
- /* used to track pressure stalls */
- struct psi_group psi;
- struct cgroup cgrp;
+};
+static inline struct cgroup_psi *get_cgroup_psi(struct cgroup *cgrp) +{
- return container_of(cgrp, struct cgroup_psi, cgrp);
+}
/*
- A cgroup_root represents the root of a cgroup hierarchy, and may be
- associated with a kernfs_root to form an active hierarchy. This is
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 73acf8b..1b68815 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -671,7 +671,9 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) {
- return &cgrp->psi;
- struct cgroup_psi *cgroup = get_cgroup_psi(cgrp);
- return &cgroup->psi;
}
static inline void cgroup_init_kthreadd(void) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index ab81a22..4d3ddaa 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -269,16 +269,18 @@ struct kernfs_ops { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off);
- __poll_t (*poll)(struct kernfs_open_file *of,
struct poll_table_struct *pt);
- int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
#ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif
+#ifndef __GENKSYMS__
- __poll_t (*poll)(struct kernfs_open_file *of,
struct poll_table_struct *pt);
+#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) };
diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 206f0c3..de1b645 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,6 +4,9 @@ /* Simple interface for creating and stopping kernel threads without mess. */ #include <linux/err.h> #include <linux/sched.h> +#ifdef __GENKSYMS__ +#include <linux/cgroup.h> +#endif
__printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b55dd22..e7d2bca 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -164,7 +164,6 @@ enum node_stat_item { NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE,
- WORKINGSET_RESTORE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 97c0918..dcafd8c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -69,14 +69,13 @@ */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */
- PG_error, PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_active,
- PG_workingset, PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
- PG_error, PG_slab, PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ PG_arch_1,
@@ -103,6 +102,9 @@ enum pageflags { PG_idle, #endif PG_percpu_ref, +#ifndef __GENKSYMS__
- PG_workingset,
+#endif __NR_PAGEFLAGS,
/* Filesystems */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d137ff5..1c254d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -723,7 +723,7 @@ struct task_struct { unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_remote_wakeup:1; -#ifdef CONFIG_PSI +#if (defined(CONFIG_PSI) && !defined(__GENKSYMS__)) unsigned sched_psi_wake_requeue:1; #endif
@@ -980,10 +980,6 @@ struct task_struct { siginfo_t *last_siginfo;
struct task_io_accounting ioac; -#ifdef CONFIG_PSI
- /* Pressure stall state */
- unsigned int psi_flags;
-#endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; @@ -1255,7 +1251,15 @@ struct task_struct { #else KABI_RESERVE(5) #endif +#if (defined(CONFIG_PSI) && !defined(__GENKSYMS__))
- /* Pressure stall state */
- union {
unsigned int psi_flags;
unsigned long psi_flags_padding;
- };
+#else KABI_RESERVE(6) +#endif KABI_RESERVE(7) KABI_RESERVE(8)
diff --git a/include/linux/swap.h b/include/linux/swap.h index 12a39b2..cab2987 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -322,7 +322,8 @@ struct vma_swap_readahead {
/* linux/mm/workingset.c */ void *workingset_eviction(struct address_space *mapping, struct page *page); -void workingset_refault(struct page *page, void *shadow); +bool workingset_refault(void *shadow); +void workingset_refault_fixkabi(struct page *page, void *shadow); void workingset_activation(struct page *page);
/* Do not use directly, use workingset_lookup_update */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8066faa..2cb08ce 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3434,21 +3434,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
struct psi_group *psi = cgroup->id == 1 ? &psi_system : cgroup_psi(cgroup);
return psi_show(seq, psi, PSI_IO);
} static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
struct psi_group *psi = cgroup->id == 1 ? &psi_system : cgroup_psi(cgroup);
return psi_show(seq, psi, PSI_MEM);
} static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
struct psi_group *psi = cgroup->id == 1 ? &psi_system : cgroup_psi(cgroup);
return psi_show(seq, psi, PSI_CPU);
} @@ -4712,7 +4712,11 @@ static void css_free_rwork_fn(struct work_struct *work) psi_cgroup_free(cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_exit(cgrp); +#ifdef CONFIG_PSI
kfree(get_cgroup_psi(cgrp));
+#else kfree(cgrp); +#endif } else { /* * This is root cgroup's refcnt reaching zero, @@ -4933,15 +4937,26 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, static struct cgroup *cgroup_create(struct cgroup *parent) { struct cgroup_root *root = parent->root; +#ifdef CONFIG_PSI
- struct cgroup_psi *psi_cgrp;
+#endif struct cgroup *cgrp, *tcgrp; int level = parent->level + 1; int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */ +#ifdef CONFIG_PSI
- psi_cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)) + offsetof(struct cgroup_psi, cgrp), GFP_KERNEL);
- if (!psi_cgrp)
return ERR_PTR(-ENOMEM);
- cgrp = &psi_cgrp->cgrp;
+#else cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); +#endif
ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -5025,7 +5040,11 @@ static struct cgroup *cgroup_create(struct cgroup *parent) out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: +#ifdef CONFIG_PSI
- kfree(psi_cgrp);
+#else kfree(cgrp); +#endif return ERR_PTR(ret); }
diff --git a/kernel/kthread.c b/kernel/kthread.c index da07863..79d068e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -11,7 +11,9 @@ #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> +#ifndef __GENKSYMS__ #include <linux/cgroup.h> +#endif #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 35b8c79..ed9eeb9 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -650,11 +650,13 @@ void psi_memstall_leave(unsigned long *flags) }
#ifdef CONFIG_CGROUPS -int psi_cgroup_alloc(struct cgroup *cgroup) +int psi_cgroup_alloc(struct cgroup *cgrp) {
struct cgroup_psi *cgroup; if (static_branch_likely(&psi_disabled)) return 0;
cgroup = get_cgroup_psi(cgrp); cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); if (!cgroup->psi.pcpu) return -ENOMEM;
@@ -662,11 +664,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup) return 0; }
-void psi_cgroup_free(struct cgroup *cgroup) +void psi_cgroup_free(struct cgroup *cgrp) {
struct cgroup_psi *cgroup; if (static_branch_likely(&psi_disabled)) return;
cgroup = get_cgroup_psi(cgrp); cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu);
} diff --git a/mm/filemap.c b/mm/filemap.c index 66460c9..3df50ba 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1026,7 +1026,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, */ WARN_ON_ONCE(PageActive(page)); if (!(gfp_mask & __GFP_WRITE) && shadow)
workingset_refault(page, shadow);
if (!PagePercpuRef(page)) lru_cache_add(page); }workingset_refault_fixkabi(page, shadow);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46596d2..0bccf2b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6130,8 +6130,6 @@ static int memory_stat_show(struct seq_file *m, void *v) memcg_page_state(memcg, WORKINGSET_REFAULT)); seq_printf(m, "workingset_activate %lu\n", memcg_page_state(memcg, WORKINGSET_ACTIVATE));
- seq_printf(m, "workingset_restore %lu\n",
seq_printf(m, "workingset_nodereclaim %lu\n", memcg_page_state(memcg, WORKINGSET_NODERECLAIM));memcg_page_state(memcg, WORKINGSET_RESTORE));
diff --git a/mm/vmstat.c b/mm/vmstat.c index e183cb2..96028cc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1145,7 +1145,6 @@ int fragmentation_index(struct zone *zone, unsigned int order) "nr_isolated_file", "workingset_refault", "workingset_activate",
- "workingset_restore", "workingset_nodereclaim", "nr_anon_pages", "nr_mapped",
diff --git a/mm/workingset.c b/mm/workingset.c index 039eea7..ea37998 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -242,13 +242,14 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
/**
- workingset_refault - evaluate the refault of a previously evicted page
- @page: the freshly allocated replacement page
- @shadow: shadow entry of the evicted page
- Calculates and evaluates the refault distance of the previously
- evicted page in the context of the node it was allocated in.
*/
- Returns %true if the page should be activated, %false otherwise.
-void workingset_refault(struct page *page, void *shadow) +bool workingset_refault(void *shadow) { unsigned long refault_distance; struct pglist_data *pgdat; @@ -258,6 +259,7 @@ void workingset_refault(struct page *page, void *shadow) struct lruvec *lruvec; unsigned long refault; bool workingset;
bool ret = false; int memcgid;
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
@@ -313,17 +315,38 @@ void workingset_refault(struct page *page, void *shadow) if (refault_distance > active_file) goto out;
- SetPageActive(page);
- ret = true; atomic_long_inc(&lruvec->inactive_age); inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
- /* Page was active prior to eviction */
- if (workingset) {
SetPageWorkingset(page);
inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
- }
out: rcu_read_unlock();
- return ret;
+}
+/**
- workingset_refault_fixkabi - evaluate the refault of a previously
- evicted page
- @page: the freshly allocated replacement page
- @shadow: shadow entry of the evicted page
- Calculates and evaluates the refault distance of the previously
- evicted page in the context of the node it was allocated in.
- */
+void workingset_refault_fixkabi(struct page *page, void *shadow) +{
- unsigned long entry = (unsigned long)shadow;
- bool workingset;
- entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
- workingset = entry & 1;
- if (workingset_refault(shadow)) {
SetPageActive(page);
if (workingset)
SetPageWorkingset(page);
- }
}
/**