From: yuzhoujian yuzhoujian@didichuxing.com
mainline inclusion
from mainline-5.0-rc1
commit ef8444ea01d7442652f8e1b8a8b94278cb57eafd
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------

The OOM report contains several sections. The first one is the allocation context that triggered the OOM. Then we have the cpuset context, followed by the stack trace of the OOM path. The third one is the OOM memory information, followed by the current memory state of all system tasks. At last, we show the oom-eligible tasks and the information about the chosen oom victim.
One thing that makes parsing more awkward than necessary is that we do not have a single, easily parsable line about the oom context. This patch reorganizes the oom report into:
1) who invoked oom and what was the allocation request
[ 515.902945] tuned invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
2) OOM stack trace
[ 515.904273] CPU: 24 PID: 1809 Comm: tuned Not tainted 4.20.0-rc3+ #3
[ 515.905518] Hardware name: Inspur SA5212M4/YZMB-00370-107, BIOS 4.1.10 11/14/2016
[ 515.906821] Call Trace:
[ 515.908062]  dump_stack+0x5a/0x73
[ 515.909311]  dump_header+0x55/0x28c
[ 515.914260]  oom_kill_process+0x2d8/0x300
[ 515.916708]  out_of_memory+0x145/0x4a0
[ 515.917932]  __alloc_pages_slowpath+0x7d2/0xa16
[ 515.919157]  __alloc_pages_nodemask+0x277/0x290
[ 515.920367]  filemap_fault+0x3d0/0x6c0
[ 515.921529]  ? filemap_map_pages+0x2b8/0x420
[ 515.922709]  ext4_filemap_fault+0x2c/0x40 [ext4]
[ 515.923884]  __do_fault+0x20/0x80
[ 515.925032]  __handle_mm_fault+0xbc0/0xe80
[ 515.926195]  handle_mm_fault+0xfa/0x210
[ 515.927357]  __do_page_fault+0x233/0x4c0
[ 515.928506]  do_page_fault+0x32/0x140
[ 515.929646]  ? page_fault+0x8/0x30
[ 515.930770]  page_fault+0x1e/0x30
3) OOM memory information
[ 515.958093] Mem-Info:
[ 515.959647] active_anon:26501758 inactive_anon:1179809 isolated_anon:0
 active_file:4402672 inactive_file:483963 isolated_file:1344
 unevictable:0 dirty:4886753 writeback:0 unstable:0
 slab_reclaimable:148442 slab_unreclaimable:18741
 mapped:1347 shmem:1347 pagetables:58669 bounce:0
 free:88663 free_pcp:0 free_cma:0
...
4) current memory state of all system tasks
[ 516.079544] [ 744] 0 744 9211 1345 114688 82 0 systemd-journal
[ 516.082034] [ 787] 0 787 31764 0 143360 92 0 lvmetad
[ 516.084465] [ 792] 0 792 10930 1 110592 208 -1000 systemd-udevd
[ 516.086865] [ 1199] 0 1199 13866 0 131072 112 -1000 auditd
[ 516.089190] [ 1222] 0 1222 31990 1 110592 157 0 smartd
[ 516.091477] [ 1225] 0 1225 4864 85 81920 43 0 irqbalance
[ 516.093712] [ 1226] 0 1226 52612 0 258048 426 0 abrtd
[ 516.112128] [ 1280] 0 1280 109774 55 299008 400 0 NetworkManager
[ 516.113998] [ 1295] 0 1295 28817 37 69632 24 0 ksmtuned
[ 516.144596] [ 10718] 0 10718 2622484 1721372 15998976 267219 0 panic
[ 516.145792] [ 10719] 0 10719 2622484 1164767 9818112 53576 0 panic
[ 516.146977] [ 10720] 0 10720 2622484 1174361 9904128 53709 0 panic
[ 516.148163] [ 10721] 0 10721 2622484 1209070 10194944 54824 0 panic
[ 516.149329] [ 10722] 0 10722 2622484 1745799 14774272 91138 0 panic
5) oom context (constraints and the chosen victim).
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0-1,task=panic,pid=10737,uid=0
An admin can easily get the full oom context on a single line, which makes parsing much easier.
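For illustration, a minimal userspace sketch (not part of this patch) that pulls the key fields out of the one-line summary; the field layout is the one shown above, and the buffer sizes are arbitrary:

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          const char *line =
                  "oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),"
                  "cpuset=/,mems_allowed=0-1,task=panic,pid=10737,uid=0";
          const char *c = strstr(line, "constraint=");
          const char *t = strstr(line, "task=");
          char constraint[64], task[64];
          int pid, uid;

          if (!c || !t)
                  return 1;
          /* Each field is comma-terminated, so stop %[...] at ','. */
          if (sscanf(c, "constraint=%63[^,]", constraint) != 1 ||
              sscanf(t, "task=%63[^,],pid=%d,uid=%d", task, &pid, &uid) != 3)
                  return 1;
          printf("constraint=%s task=%s pid=%d uid=%d\n",
                 constraint, task, pid, uid);
          return 0;
  }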
Link: http://lkml.kernel.org/r/1542799799-36184-1-git-send-email-ufo19890607@gmail...
Signed-off-by: yuzhoujian yuzhoujian@didichuxing.com
Acked-by: Michal Hocko mhocko@suse.com
Cc: Andrea Arcangeli aarcange@redhat.com
Cc: David Rientjes rientjes@google.com
Cc: "Kirill A . Shutemov" kirill.shutemov@linux.intel.com
Cc: Roman Gushchin guro@fb.com
Cc: Tetsuo Handa penguin-kernel@i-love.sakura.ne.jp
Cc: Yang Shi yang.s@alibaba-inc.com
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
(cherry picked from commit ef8444ea01d7442652f8e1b8a8b94278cb57eafd)
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
(cherry picked from commit 985eab72d54b5ac73189d609486526b5e30125ac)
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 include/linux/oom.h | 10 ++++++++++
 kernel/cgroup/cpuset.c | 4 ++--
 mm/oom_kill.c | 29 ++++++++++++++++++++---------
 mm/page_alloc.c | 4 ++--
 4 files changed, 34 insertions(+), 13 deletions(-)
diff --git a/include/linux/oom.h b/include/linux/oom.h index 2b2a40cc19bf4..123538b89dc8d 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -15,6 +15,13 @@ struct notifier_block; struct mem_cgroup; struct task_struct;
+enum oom_constraint { + CONSTRAINT_NONE, + CONSTRAINT_CPUSET, + CONSTRAINT_MEMORY_POLICY, + CONSTRAINT_MEMCG, +}; + /* * Details of the page allocation that triggered the oom killer that are used to * determine what should be killed. @@ -42,6 +49,9 @@ struct oom_control { unsigned long totalpages; struct task_struct *chosen; unsigned long chosen_points; + + /* Used to print the constraint info. */ + enum oom_constraint constraint; };
extern struct mutex oom_lock; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a4ce9474a078c..feb91177247c8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2681,9 +2681,9 @@ void cpuset_print_current_mems_allowed(void) rcu_read_lock();
cgrp = task_cs(current)->css.cgroup; - pr_info("%s cpuset=", current->comm); + pr_cont(",cpuset="); pr_cont_cgroup_name(cgrp); - pr_cont(" mems_allowed=%*pbl\n", + pr_cont(",mems_allowed=%*pbl", nodemask_pr_args(¤t->mems_allowed));
rcu_read_unlock(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2e09b03432c08..51104048b4e7f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -250,11 +250,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, return points > 0 ? points : 1; }
-enum oom_constraint { - CONSTRAINT_NONE, - CONSTRAINT_CPUSET, - CONSTRAINT_MEMORY_POLICY, - CONSTRAINT_MEMCG, +static const char * const oom_constraint_text[] = { + [CONSTRAINT_NONE] = "CONSTRAINT_NONE", + [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET", + [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY", + [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", };
/* @@ -483,16 +483,25 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) rcu_read_unlock(); }
+static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) +{ + /* one line summary of the oom killer context. */ + pr_info("oom-kill:constraint=%s,nodemask=%*pbl", + oom_constraint_text[oc->constraint], + nodemask_pr_args(oc->nodemask)); + cpuset_print_current_mems_allowed(); + pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid, + from_kuid(&init_user_ns, task_uid(victim))); +} + static void dump_header(struct oom_control *oc, struct task_struct *p) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, &oc->gfp_mask, - nodemask_pr_args(oc->nodemask), oc->order, + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n");
- cpuset_print_current_mems_allowed(); dump_stack(); if (is_memcg_oom(oc)) mem_cgroup_print_oom_info(oc->memcg, p); @@ -503,6 +512,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) } if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); + if (p) + dump_oom_summary(oc, p); }
/* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6bf3d4461430e..2b5a60a856680 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3529,13 +3529,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", current->comm, &vaf, gfp_mask, &gfp_mask, nodemask_pr_args(nodemask)); va_end(args);
cpuset_print_current_mems_allowed(); - + pr_cont("\n"); dump_stack(); warn_alloc_show_mem(gfp_mask, nodemask); }
From: Yafang Shao laoar.shao@gmail.com
mainline inclusion
from mainline-5.2-rc7
commit 432b1de0de02a83f64695e69a2d83cbee10c236f
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------

In dump_oom_summary(), oc->constraint is used to select the oom_constraint_text, but it has not been set at that point, so it always holds the default value 0 (CONSTRAINT_NONE). We should initialize it beforehand.
Below is the output when a memcg oom occurs,
before this patch: oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null), cpuset=/,mems_allowed=0,oom_memcg=/foo,task_memcg=/foo,task=bash,pid=7997,uid=0
after this patch: oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null), cpuset=/,mems_allowed=0,oom_memcg=/foo,task_memcg=/foo,task=bash,pid=13681,uid=0
Link: http://lkml.kernel.org/r/1560522038-15879-1-git-send-email-laoar.shao@gmail....
Fixes: ef8444ea01d7 ("mm, oom: reorganize the oom report in dump_header")
Signed-off-by: Yafang Shao laoar.shao@gmail.com
Acked-by: Michal Hocko mhocko@suse.com
Cc: Wind Yu yuzhoujian@didichuxing.com
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
(cherry picked from commit 432b1de0de02a83f64695e69a2d83cbee10c236f)
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 mm/oom_kill.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 51104048b4e7f..c4e5445dc0c48 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1080,8 +1080,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ -static void check_panic_on_oom(struct oom_control *oc, - enum oom_constraint constraint) +static void check_panic_on_oom(struct oom_control *oc) { if (likely(!sysctl_panic_on_oom)) return; @@ -1091,7 +1090,7 @@ static void check_panic_on_oom(struct oom_control *oc, * does not panic for cpuset, mempolicy, or memcg allocation * failures. */ - if (constraint != CONSTRAINT_NONE) + if (oc->constraint != CONSTRAINT_NONE) return; } /* Do not panic for oom kills triggered by sysrq */ @@ -1168,7 +1167,6 @@ EXPORT_SYMBOL_GPL(unregister_hisi_oom_notifier); bool out_of_memory(struct oom_control *oc) { unsigned long freed = 0; - enum oom_constraint constraint = CONSTRAINT_NONE; #ifdef CONFIG_ASCEND_OOM unsigned long oom_type; #endif @@ -1222,10 +1220,10 @@ bool out_of_memory(struct oom_control *oc) * Check if there were limitations on the allocation (only relevant for * NUMA and memcg) that may require different handling. */ - constraint = constrained_alloc(oc); - if (constraint != CONSTRAINT_MEMORY_POLICY) + oc->constraint = constrained_alloc(oc); + if (oc->constraint != CONSTRAINT_MEMORY_POLICY) oc->nodemask = NULL; - check_panic_on_oom(oc, constraint); + check_panic_on_oom(oc);
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
From: yuzhoujian yuzhoujian@didichuxing.com
mainline inclusion
from mainline-5.0-rc1
commit f0c867d9588d9efc10d6a55009c9560336673369
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------

The current oom report doesn't display the victim's memcg context during a global OOM situation. While this information is not strictly needed, it can be really helpful for containerized environments to locate which container has lost a process. Now that we have a single line for the oom context, we can trivially add both the oom memcg (either global_oom or the specific memcg that hit its hard limit) and task_memcg, which is the victim's memcg.
Below is the single line output in the oom report after this patch.
- global oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,global_oom,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
- memcg oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,oom_memcg=<memcg>,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
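A log consumer can tell the two cases apart by checking which marker is present. A hedged sketch, using the two sample lines from the previous changelog (the marker strings are the ones shown in the templates above):

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          const char *lines[] = {
                  "oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),"
                  "cpuset=/,mems_allowed=0,global_oom,task_memcg=/foo,"
                  "task=bash,pid=7997,uid=0",
                  "oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),"
                  "cpuset=/,mems_allowed=0,oom_memcg=/foo,task_memcg=/foo,"
                  "task=bash,pid=13681,uid=0",
          };

          for (int i = 0; i < 2; i++) {
                  /* global_oom and oom_memcg= are mutually exclusive. */
                  if (strstr(lines[i], ",global_oom"))
                          printf("line %d: global oom\n", i);
                  else if (strstr(lines[i], ",oom_memcg="))
                          printf("line %d: memcg oom\n", i);
          }
          return 0;
  }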
[penguin-kernel@I-love.SAKURA.ne.jp: use pr_cont() in mem_cgroup_print_oom_context()]
Link: http://lkml.kernel.org/r/201812190723.wBJ7NdkN032628@www262.sakura.ne.jp
Link: http://lkml.kernel.org/r/1542799799-36184-2-git-send-email-ufo19890607@gmail...
Signed-off-by: yuzhoujian yuzhoujian@didichuxing.com
Signed-off-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp
Acked-by: Michal Hocko mhocko@suse.com
Cc: David Rientjes rientjes@google.com
Cc: "Kirill A . Shutemov" kirill.shutemov@linux.intel.com
Cc: Andrea Arcangeli aarcange@redhat.com
Cc: Tetsuo Handa penguin-kernel@i-love.sakura.ne.jp
Cc: Roman Gushchin guro@fb.com
Cc: Yang Shi yang.s@alibaba-inc.com
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
(cherry picked from commit f0c867d9588d9efc10d6a55009c9560336673369)
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
(cherry picked from commit cd2daf20418aa32ce8f81916c073a1ad459c8fac)
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 include/linux/memcontrol.h | 11 +++++++++--
 mm/memcontrol.c | 33 ++++++++++++++++++++-------------
 mm/oom_kill.c | 3 ++-
 3 files changed, 31 insertions(+), 16 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 52604319712b9..cdc9109f63d90 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -554,9 +554,11 @@ void mem_cgroup_handle_over_high(void);
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, +void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p);
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); + static inline void mem_cgroup_enter_user_fault(void) { WARN_ON(current->in_user_fault); @@ -1001,7 +1003,12 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) }
static inline void -mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) +{ +} + +static inline void +mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b42d615fa8479..78b203e4b0d79 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1316,32 +1316,39 @@ static const char *const memcg1_stat_names[] = {
#define K(x) ((x) << (PAGE_SHIFT-10)) /** - * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. + * mem_cgroup_print_oom_context: Print OOM information relevant to + * memory controller. * @memcg: The memory cgroup that went over limit * @p: Task that is going to be killed * * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is * enabled */ -void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { - struct mem_cgroup *iter; - unsigned int i; - rcu_read_lock();
+ if (memcg) { + pr_cont(",oom_memcg="); + pr_cont_cgroup_path(memcg->css.cgroup); + } else + pr_cont(",global_oom"); if (p) { - pr_info("Task in "); + pr_cont(",task_memcg="); pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); - pr_cont(" killed as a result of limit of "); - } else { - pr_info("Memory limit reached of cgroup "); } - - pr_cont_cgroup_path(memcg->css.cgroup); - pr_cont("\n"); - rcu_read_unlock(); +} + +/** + * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to + * memory controller. + * @memcg: The memory cgroup that went over limit + */ +void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) +{ + struct mem_cgroup *iter; + unsigned int i;
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c4e5445dc0c48..d422223d2d6bf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -490,6 +490,7 @@ static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) oom_constraint_text[oc->constraint], nodemask_pr_args(oc->nodemask)); cpuset_print_current_mems_allowed(); + mem_cgroup_print_oom_context(oc->memcg, victim); pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid, from_kuid(&init_user_ns, task_uid(victim))); } @@ -504,7 +505,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
dump_stack(); if (is_memcg_oom(oc)) - mem_cgroup_print_oom_info(oc->memcg, p); + mem_cgroup_print_oom_meminfo(oc->memcg); else { show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); if (is_dump_unreclaim_slabs())
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion
from mainline-5.2-rc1
commit 22796c844fcb85f3b289c0e698713b7fa4d9c178
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------

Instead of adding up the node counters, use memcg_page_state() to get the memcg state directly. This is a bit cheaper and more streamlined.
Link: http://lkml.kernel.org/r/20190228163020.24100-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner hannes@cmpxchg.org
Reviewed-by: Roman Gushchin guro@fb.com
Cc: Michal Hocko mhocko@kernel.org
Cc: Tejun Heo tj@kernel.org
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
(cherry picked from commit 22796c844fcb85f3b289c0e698713b7fa4d9c178)
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
(cherry picked from commit 606207196d565752d874ad8b47c182878d0169da)
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 mm/memcontrol.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78b203e4b0d79..c4ebc6e16de3d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -740,10 +740,13 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { unsigned long nr = 0; - int nid; + enum lru_list lru;
- for_each_node_state(nid, N_MEMORY) - nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + } return nr; }
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion
from mainline-5.2-rc1
commit 1a61ab8038e724a6d8aa59e7d4931a119483294d
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------

Instead of adding up the zone counters, use lruvec_page_state() to get the node state directly. This is a bit cheaper and more streamlined.
Link: http://lkml.kernel.org/r/20190228163020.24100-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner hannes@cmpxchg.org
Reviewed-by: Roman Gushchin guro@fb.com
Cc: Michal Hocko mhocko@kernel.org
Cc: Tejun Heo tj@kernel.org
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
(cherry picked from commit 1a61ab8038e724a6d8aa59e7d4931a119483294d)
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
(cherry picked from commit 15181773808055a32fc1cbbe6f6b44761267e536)
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 include/linux/memcontrol.h | 18 ------------------
 mm/memcontrol.c | 2 +-
 mm/vmscan.c | 2 +-
 3 files changed, 2 insertions(+), 20 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cdc9109f63d90..4795f9ee72124 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -527,19 +527,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask);
-static inline -unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) -{ - struct mem_cgroup_per_node *mz; - unsigned long nr_pages = 0; - int zid; - - mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for (zid = 0; zid < MAX_NR_ZONES; zid++) - nr_pages += mz->lru_zone_size[zid][lru]; - return nr_pages; -} - static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -978,11 +965,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return true; }
-static inline unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) -{ - return 0; -} static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c4ebc6e16de3d..47a5e27d55278 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -731,7 +731,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += mem_cgroup_get_lru_size(lruvec, lru); + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); } return nr; } diff --git a/mm/vmscan.c b/mm/vmscan.c index b4850580770cb..d2c268ca401c5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -356,7 +356,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone int zid;
if (!mem_cgroup_disabled()) - lru_size = mem_cgroup_get_lru_size(lruvec, lru); + lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru); else lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
From: Johannes Weiner jweiner@fb.com
mainline inclusion
from mainline-4.20-rc1
commit 95f9ab2d596e8cbb388315e78c82b9a131bf2928
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------

Patch series "psi: pressure stall information for CPU, memory, and IO", v4.
Overview
PSI reports the overall wallclock time in which the tasks in a system (or cgroup) wait for (contended) hardware resources.
This helps users understand the resource pressure their workloads are under, which allows them to root-cause and fix throughput and latency problems caused by overcommitting, underprovisioning, or suboptimal job placement in a grid, as well as anticipate major disruptions like OOM.
Real-world applications
We're using the data collected by PSI (and its previous incarnation, memdelay) quite extensively at Facebook, and with several success stories.
One use case is avoiding OOM hangs/livelocks. These happen because the OOM killer is triggered by reclaim not being able to free pages, but with fast flash devices there is *always* some clean and uptodate cache to reclaim; the OOM killer never kicks in, even as tasks spend 90% of their time thrashing the cache pages of their own executables. There is no situation where this ever makes sense in practice. We wrote a <100 line POC python script to monitor memory pressure and kill stuff way before such pathological thrashing leads to full system losses that would require forcible hard resets.
We've since extended and deployed this code into other places to guarantee latency and throughput SLAs, since they're usually violated way before the kernel OOM killer would ever kick in.
It is available here: https://github.com/facebookincubator/oomd
Eventually we probably want to trigger the in-kernel OOM killer based on extreme sustained pressure as well, so that Linux can avoid memory livelocks - which technically aren't deadlocks, but to the user indistinguishable from them - out of the box. We'd continue using OOMD as the first line of defense to ensure workload health and implement complex kill policies that are beyond the scope of the kernel.
We also use PSI memory pressure for loadshedding. Our batch job infrastructure used to use heuristics based on various VM stats to anticipate OOM situations, with lackluster success. We switched it to PSI and managed to anticipate and avoid OOM kills and lockups fairly reliably. The reduction of OOM outages in the worker pool raised the pool's aggregate productivity, and we were able to switch that service to smaller machines.
Lastly, we use cgroups to isolate a machine's main workload from maintenance crap like package upgrades, logging, configuration, as well as to prevent multiple workloads on a machine from stepping on each others' toes. We were not able to configure this properly without the pressure metrics; we would see latency or bandwidth drops, but it would often be hard or impossible to root-cause them post-mortem.
We now log and graph pressure for the containers in our fleet and can trivially link latency spikes and throughput drops to shortages of specific resources after the fact, and fix the job config/scheduling.
PSI has also received testing, feedback, and feature requests from Android and EndlessOS for the purpose of low-latency OOM killing, to intervene in pressure situations before the UI starts hanging.
How do you use this feature?
A kernel with CONFIG_PSI=y will create a /proc/pressure directory with 3 files: cpu, memory, and io. If using cgroup2, cgroups will also have cpu.pressure, memory.pressure and io.pressure files, which simply aggregate task stalls at the cgroup level instead of system-wide.
The cpu file contains one line:
some avg10=2.04 avg60=0.75 avg300=0.40 total=157656722
The averages give the percentage of walltime in which one or more tasks are delayed on the runqueue while another task has the CPU. They're recent averages over 10s, 1m, 5m windows, so you can tell short term trends from long term ones, similarly to the load average.
The total= value gives the absolute stall time in microseconds. This allows detecting latency spikes that might be too short to sway the running averages. It also allows custom time averaging in case the 10s/1m/5m windows aren't adequate for the use case (or are too coarse with future hardware).
What to make of this "some" metric? If CPU utilization is at 100% and CPU pressure is 0, it means the system is perfectly utilized, with one runnable thread per CPU and nobody waiting. At two or more runnable tasks per CPU, the system is 100% overcommitted and the pressure average will indicate as much. From a utilization perspective this is a great state of course: no CPU cycles are being wasted, even if 50% of the threads were to go idle (as most workloads do vary). From the perspective of the individual job it's not great, however, and they would do better with more resources. Depending on what your priority and options are, raised "some" numbers may or may not require action.
The memory file contains two lines:
some avg10=70.24 avg60=68.52 avg300=69.91 total=3559632828 full avg10=57.59 avg60=58.06 avg300=60.38 total=3300487258
The some line is the same as for cpu, the time in which at least one task is stalled on the resource. In the case of memory, this includes waiting on swap-in, page cache refaults and page reclaim.
The full line, however, indicates time in which *nobody* is using the CPU productively due to pressure: all non-idle tasks are waiting for memory in one form or another. Significant time spent in there is a good trigger for killing things, moving jobs to other machines, or dropping incoming requests, since neither the jobs nor the machine overall are making too much headway.
The io file is similar to memory. Because the block layer doesn't have a concept of hardware contention right now (how much longer is my IO request taking due to other tasks?), it reports CPU potential lost on all IO delays, not just the potential lost due to competition.
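For illustration, a minimal userspace sketch (not part of this series) that reads /proc/pressure/memory in the format described above; sampling total= twice and dividing the delta by the wall-clock interval yields a custom-window pressure percentage:

  #include <stdio.h>

  int main(void)
  {
          FILE *f = fopen("/proc/pressure/memory", "r");
          float s10, s60, s300, f10, f60, f300;
          unsigned long long some_total, full_total;

          if (!f)
                  return 1;
          if (fscanf(f, "some avg10=%f avg60=%f avg300=%f total=%llu\n",
                     &s10, &s60, &s300, &some_total) != 4 ||
              fscanf(f, "full avg10=%f avg60=%f avg300=%f total=%llu\n",
                     &f10, &f60, &f300, &full_total) != 4) {
                  fclose(f);
                  return 1;
          }
          fclose(f);
          /* total= is cumulative stall time in microseconds. */
          printf("some avg10=%.2f%% full avg10=%.2f%% some_total=%lluus\n",
                 s10, f10, some_total);
          return 0;
  }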
FAQ
Q: How is PSI's CPU component different from the load average?
A: There are several quirks in the load average that make it hard, if not impossible, to tell how overcommitted the CPU really is.
1. The load average is reported as a raw number of active tasks. You need to know how many CPUs there are in the system, how many CPUs the workload is allowed to use, then think about what the proportion between load and the number of CPUs mean for the tasks trying to run.
PSI reports the percentage of wallclock time in which tasks are waiting for a CPU to run on. It doesn't matter how many CPUs are present or usable. The number always tells the quality of life of tasks in the system or in a particular cgroup.
2. The shortest averaging window is 1m, which is extremely coarse, and it's sampled in 5s intervals. A *lot* can happen on a CPU in 5 seconds. This *may* be able to identify persistent long-term trends and very clear and obvious overloads, but it's unusable for latency spikes and more subtle overutilization.
PSI's shortest window is 10s. It also exports the cumulative stall times (in microseconds) of synchronously recorded events.
3. On Linux, the load average for historical reasons includes all TASK_UNINTERRUPTIBLE tasks. This gives a broader sense of how busy the system is, but on the flip side it doesn't distinguish whether tasks are likely to contend over the CPU or IO - which obviously requires very different interventions from a sys admin or a job scheduler.
PSI reports independent metrics for CPU and IO. You can tell which resource is making the tasks wait, but in conjunction still see how overloaded the system is overall.
Q: What's the cost / performance impact of this feature?
A: PSI's primary cost is in the scheduler, in particular task wakeups and sleeps.
I benchmarked this code using Facebook's two most scheduling sensitive workloads: memcache and webserver. They handle a ton of small requests - lots of wakeups and sleeps with little actual work in between - so they tend to be canaries for scheduler regressions.
In the tests, the boxes were handling live traffic over the course of several hours. Half the machines, the control, ran with CONFIG_PSI=n.
For memcache I used eight machines total. They're 2-socket, 14 core, 56 thread boxes. The test runs for half the test period, flips the test and control kernels on the hardware to rule out HW factors, DC location etc., then runs the other half of the test.
For the webservers, I used 32 machines total. They're single socket, 16 core, 32 thread machines.
During the memcache test, CPU load was nopsi=78.05% psi=78.98% in the first half and nopsi=77.52% psi=78.25% in the second half, so PSI added between 0.7 and 0.9 percentage points to the CPU load, a difference of about 1%.
UPDATE: I re-ran this test with the v3 version of this patch set and the CPU utilization was equivalent between test and control.
UPDATE: v4 is on par with v3.
As far as end-to-end request latency from the client perspective goes, we don't sample those finely enough to capture the requests going to those particular machines during the test, but we know the p50 turnaround time in this workload is 54us, and perf bench sched pipe on those machines show nopsi=5.232666 us/op and psi=5.587347 us/op, so this doesn't add much here either.
The profile for the pipe benchmark shows:
0.87% sched-pipe [kernel.vmlinux] [k] psi_group_change
0.83% perf.real [kernel.vmlinux] [k] psi_group_change
0.82% perf.real [kernel.vmlinux] [k] psi_task_change
0.58% sched-pipe [kernel.vmlinux] [k] psi_task_change
The webserver load is running inside 4 nested cgroup levels. The CPU load with both nopsi and psi kernels was indistinguishable at 81%.
For comparison, we had to disable the cgroup cpu controller on the webservers because it added 4 percentage points to the CPU% during this same exact test.
Versions of this accounting code now run on 80% of our fleet. None of our workloads have reported regressions during the rollout.
Daniel Drake said:
: I just retested the latest version at
: http://git.cmpxchg.org/cgit.cgi/linux-psi.git (Linux 4.18) and the results
: are great.
:
: Test setup:
: Endless OS
: GeminiLake N4200 low end laptop
: 2GB RAM
: swap (and zram swap) disabled
:
: Baseline test: open a handful of large-ish apps and several website
: tabs in Google Chrome.
:
: Results: after a couple of minutes, system is excessively thrashing, mouse
: cursor can barely be moved, UI is not responding to mouse clicks, so it's
: impractical to recover from this situation as an ordinary user
:
: Add my simple killer:
: https://gist.github.com/dsd/a8988bf0b81a6163475988120fe8d9cd
:
: Results: when the thrashing causes the UI to become sluggish, the killer
: steps in and kills something (usually a chrome tab), and the system
: remains usable. I repeatedly opened more apps and more websites over a 15
: minute period but I wasn't able to get the system to a point of UI
: unresponsiveness.
Suren said:
: Backported to 4.9 and retested on ARMv8 8 code system running Android.
: Signals behave as expected reacting to memory pressure, no jumps in
: "total" counters that would indicate an overflow/underflow issues. Nicely
: done!
This patch (of 9):
If we keep just enough refault information to match the *current* page cache during reclaim time, we could lose a lot of events when there is only a temporary spike in non-cache memory consumption that pushes out all the cache. Once cache comes back, we won't see those refaults. They might not be actionable for LRU aging, but we want to know about them for measuring memory pressure.
[hannes@cmpxchg.org: switch to NUMA-aware lru and slab counters]
Link: http://lkml.kernel.org/r/20181009184732.762-2-hannes@cmpxchg.org
Link: http://lkml.kernel.org/r/20180828172258.3185-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner jweiner@fb.com
Acked-by: Peter Zijlstra (Intel) peterz@infradead.org
Reviewed-by: Rik van Riel riel@surriel.com
Tested-by: Daniel Drake drake@endlessm.com
Tested-by: Suren Baghdasaryan surenb@google.com
Cc: Ingo Molnar mingo@redhat.com
Cc: Tejun Heo tj@kernel.org
Cc: Vinayak Menon vinmenon@codeaurora.org
Cc: Christopher Lameter cl@linux.com
Cc: Peter Enderborg peter.enderborg@sony.com
Cc: Shakeel Butt shakeelb@google.com
Cc: Mike Galbraith efault@gmx.de
Cc: Randy Dunlap rdunlap@infradead.org
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
(cherry picked from commit 95f9ab2d596e8cbb388315e78c82b9a131bf2928)
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
(cherry picked from commit bcc6cb7ab8bbfb8d2c005a117d233134cd2ea1d5)
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 mm/workingset.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/mm/workingset.c b/mm/workingset.c index 4516dd7901291..7d5fa0dd2b380 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -364,7 +364,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, { unsigned long max_nodes; unsigned long nodes; - unsigned long cache; + unsigned long pages;
nodes = list_lru_shrink_count(&shadow_nodes, sc);
@@ -390,14 +390,20 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE */ +#ifdef CONFIG_MEMCG if (sc->memcg) { - cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + struct lruvec *lruvec; + + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL); + lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); + pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); + } else +#endif + pages = node_present_pages(sc->nid); + + max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3);
if (!nodes) return SHRINK_EMPTY;
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion
from mainline-5.2-rc1
commit 2b487e59f00aaa885ebf9c47d44d09f3ef4df80e
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------

mem_cgroup_node_nr_lru_pages() is just a convenience wrapper around lruvec_page_state() that takes bitmasks of lru indexes and aggregates the counts for those.
Replace callsites where the bitmask is simple enough with direct lruvec_page_state() calls.
This removes the last extern user of mem_cgroup_node_nr_lru_pages(), so make that function private again, too.
Link: http://lkml.kernel.org/r/20190228163020.24100-5-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner hannes@cmpxchg.org
Reviewed-by: Roman Gushchin guro@fb.com
Cc: Michal Hocko mhocko@kernel.org
Cc: Tejun Heo tj@kernel.org
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
(cherry picked from commit 2b487e59f00aaa885ebf9c47d44d09f3ef4df80e)
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
(cherry picked from commit fa19301eb87af01edc093c5298a19b027da9469c)
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 include/linux/memcontrol.h | 10 ----------
 mm/memcontrol.c | 10 +++++++---
 mm/workingset.c | 5 +++--
 3 files changed, 10 insertions(+), 15 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4795f9ee72124..fa34e8c707597 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -524,9 +524,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages);
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask); - static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -972,13 +969,6 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return 0; }
-static inline unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - return 0; -} - static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47a5e27d55278..86a8986fdb3c6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -719,7 +719,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); }
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); @@ -1441,11 +1441,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, int nid, bool noswap) { - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + + if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) || + lruvec_page_state(lruvec, NR_ACTIVE_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) + if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) || + lruvec_page_state(lruvec, NR_ACTIVE_ANON)) return true; return false;
diff --git a/mm/workingset.c b/mm/workingset.c index 7d5fa0dd2b380..e83307413b5f8 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -393,10 +393,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; + int i;
- pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL); lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) + pages += lruvec_page_state(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); } else
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion
from mainline-5.2-rc1
commit 21d89d151bb42bea1bcf0343f724ef62509d6161
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------

mem_cgroup_nr_lru_pages() is just a convenience wrapper around memcg_page_state() that takes bitmasks of lru indexes and aggregates the counts for those.
Replace callsites where the bitmask is simple enough with direct memcg_page_state() call(s).
Link: http://lkml.kernel.org/r/20190228163020.24100-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner hannes@cmpxchg.org
Reviewed-by: Roman Gushchin guro@fb.com
Cc: Michal Hocko mhocko@kernel.org
Cc: Tejun Heo tj@kernel.org
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
(cherry picked from commit 21d89d151bb42bea1bcf0343f724ef62509d6161)
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
(cherry picked from commit ddc1d9cb1a52052b6f9f2d70658c9ca90fd638a1)
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 mm/memcontrol.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 86a8986fdb3c6..ee1a202e60344 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1377,7 +1377,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
for (i = 0; i < NR_LRU_LISTS; i++) pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], - K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); + K(memcg_page_state(iter, NR_LRU_BASE + i)));
pr_cont("\n"); } @@ -3074,8 +3074,8 @@ static void accumulate_memcg_tree(struct mem_cgroup *memcg, acc->events_array ? acc->events_array[i] : i);
for (i = 0; i < NR_LRU_LISTS; i++) - acc->lru_pages[i] += - mem_cgroup_nr_lru_pages(mi, BIT(i)); + acc->lru_pages[i] += memcg_page_state(mi, + NR_LRU_BASE + i); } }
@@ -3700,7 +3700,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], - mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); + memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE);
/* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; @@ -4213,8 +4214,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
/* this should eventually include NR_UNSTABLE_NFS */ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); - *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | - (1 << LRU_ACTIVE_FILE)); + *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); *pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion
from mainline-5.2-rc1
commit e0ee0e71078abbcadd4cbc38fb8570551fccc103
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------

Patch series "mm: memcontrol: clean up the LRU counts tracking".
The memcg LRU stats usage is currently a bit messy. Memcg has private per-zone counters because reclaim needs zone granularity sometimes, but we also have plenty of users that need to awkwardly sum them up to node or memcg granularity. Meanwhile the canonical per-memcg vmstats do not track the LRU counts (NR_INACTIVE_ANON etc.) as you'd expect.
This series enables LRU count tracking in the per-memcg vmstats array such that lruvec_page_state() and memcg_page_state() work on the enum node_stat_item items for the LRU counters. Then it converts all the callers that don't specifically need per-zone numbers over to that.
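The intended call pattern after the conversion looks roughly like this (a sketch, not code from these patches; it assumes a valid memcg and lruvec in hand):

  /* Per-memcg and per-lruvec LRU counts via the vmstats arrays,
   * without summing per-zone counters by hand. */
  unsigned long anon = memcg_page_state(memcg, NR_ACTIVE_ANON) +
                       memcg_page_state(memcg, NR_INACTIVE_ANON);
  unsigned long file = lruvec_page_state(lruvec, NR_ACTIVE_FILE) +
                       lruvec_page_state(lruvec, NR_INACTIVE_FILE);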
This patch (of 6):
The memcg code currently maintains private per-zone breakdowns of the LRU counters. This is necessary for reclaim decisions which are still zone-based, but there are a variety of users of these counters that only want the aggregate per-lruvec or per-memcg LRU counts, and they need to painfully sum up the zone counters on each request for that.
These would be better served using the memcg vmstats arrays, which track VM statistics at the desired scope already. They just don't have the LRU counts right now.
So to kick off the conversion, begin tracking LRU counts in those.
Link: http://lkml.kernel.org/r/20190228163020.24100-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner hannes@cmpxchg.org
Reviewed-by: Roman Gushchin guro@fb.com
Cc: Tejun Heo tj@kernel.org
Cc: Michal Hocko mhocko@kernel.org
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
(cherry picked from commit e0ee0e71078abbcadd4cbc38fb8570551fccc103)
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 include/linux/mm_inline.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 10191c28fc04c..b0e3b4473ff2e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -29,7 +29,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, { struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); }
From: Chris Down chris@chrisdown.name
mainline inclusion
from mainline-v5.1-rc1
commit 1ff9e6e1798c7670ea6a7680a1ad5582df2fa914
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------
Currently THP allocation events data is fairly opaque, since you can only get it system-wide. This patch makes it easier to reason about transparent hugepage behaviour on a per-memcg basis.
For anonymous THP-backed pages, we already have MEMCG_RSS_HUGE in v1, which is used for v1's rss_huge [sic]. This is reused here, as it's fairly involved to untangle NR_ANON_THPS right now to make it per-memcg, since some of the accounting is delegated to rmap before we have any memcg actually assigned to the page. It's a good idea to rework that, but let's leave untangling THP allocation for a future patch.
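Once exported, the counters can be read from the cgroup's memory.stat file. A minimal userspace sketch (the cgroup path /sys/fs/cgroup/foo is a made-up example):

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          FILE *f = fopen("/sys/fs/cgroup/foo/memory.stat", "r");
          char key[64];
          unsigned long long val;

          if (!f)
                  return 1;
          /* memory.stat is flat "key value" pairs, one per line. */
          while (fscanf(f, "%63s %llu", key, &val) == 2) {
                  if (!strcmp(key, "anon_thp") ||
                      !strcmp(key, "thp_fault_alloc") ||
                      !strcmp(key, "thp_collapse_alloc"))
                          printf("%s = %llu\n", key, val);
          }
          fclose(f);
          return 0;
  }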
[akpm@linux-foundation.org: fix build]
[chris@chrisdown.name: fix memcontrol build when THP is disabled]
Link: http://lkml.kernel.org/r/20190131160802.GA5777@chrisdown.name
Link: http://lkml.kernel.org/r/20190129205852.GA7310@chrisdown.name
Signed-off-by: Chris Down chris@chrisdown.name
Acked-by: Johannes Weiner hannes@cmpxchg.org
Cc: Tejun Heo tj@kernel.org
Cc: Roman Gushchin guro@fb.com
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
(cherry picked from commit 1ff9e6e1798c7670ea6a7680a1ad5582df2fa914)
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
Conflicts:
	mm/memcontrol.c
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
(cherry picked from commit c0eb53caa3758dfbde7c13960431f703b3fb4184)
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 Documentation/admin-guide/cgroup-v2.rst | 16 ++++++++++++++++
 mm/huge_memory.c | 2 ++
 mm/khugepaged.c | 2 ++
 mm/memcontrol.c | 16 ++++++++++++++++
 4 files changed, 36 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 184193bcb262a..9c194bc863740 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1177,6 +1177,10 @@ PAGE_SIZE multiple when read back. Amount of cached filesystem data that was modified and is currently being written back to disk
+ anon_thp + Amount of memory used in anonymous mappings backed by + transparent hugepages + inactive_anon, active_anon, inactive_file, active_file, unevictable Amount of memory, swap-backed and filesystem-backed, on the internal memory management lists used by the @@ -1236,6 +1240,18 @@ PAGE_SIZE multiple when read back.
Amount of reclaimed lazyfree pages
+ thp_fault_alloc + + Number of transparent hugepages which were allocated to satisfy + a page fault, including COW faults. This counter is not present + when CONFIG_TRANSPARENT_HUGEPAGE is not set. + + thp_collapse_alloc + + Number of transparent hugepages which were allocated to allow + collapsing an existing range of pages. This counter is not + present when CONFIG_TRANSPARENT_HUGEPAGE is not set. + memory.swap.current A read-only single value file which exists on non-root cgroups. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 96b35b295bdf8..42b2cd0a25fcf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -681,6 +681,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); + count_memcg_events(memcg, THP_FAULT_ALLOC, 1); }
return 0; @@ -1439,6 +1440,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) }
count_vm_event(THP_FAULT_ALLOC); + count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
if (!page) clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 669404342fbe6..ad386978d7e05 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1092,6 +1092,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); mem_cgroup_commit_charge(new_page, memcg, false, true); + count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -1546,6 +1547,7 @@ static void collapse_shmem(struct mm_struct *mm, page_ref_add(new_page, HPAGE_PMD_NR - 1); set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); + count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_anon(new_page);
/* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ee1a202e60344..3eeeb63c6d5ea 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -39,6 +39,7 @@ #include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> +#include <linux/vm_event_item.h> #include <linux/smp.h> #include <linux/page-flags.h> #include <linux/backing-dev.h> @@ -5925,6 +5926,15 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "file_writeback %llu\n", (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
+ /* + * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter + * with the NR_ANON_THP vm counter, but right now it's a pain in the + * arse because it requires migrating the work out of rmap to a place + * where the page->mem_cgroup is set up and stable. + */ + seq_printf(m, "anon_thp %llu\n", + (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE); + for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], (u64)acc.lru_pages[i] * PAGE_SIZE); @@ -5956,6 +5966,12 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "workingset_nodereclaim %lu\n", acc.stat[WORKINGSET_NODERECLAIM]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE + seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]); + seq_printf(m, "thp_collapse_alloc %lu\n", + acc.events[THP_COLLAPSE_ALLOC]); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return 0; }
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion
from mainline-v4.20-rc1
commit e9b257ed150c1f43912bd66031185598451f68a9
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------
The refault stats go better with the page fault stats, and are of higher interest than the stats on LRU operations. In fact they used to be grouped together; when the LRU operation stats were added later on, they were wedged in between.
Move them back together. Documentation/admin-guide/cgroup-v2.rst already lists them in the right order.
Link: http://lkml.kernel.org/r/20181010140239.GA2527@cmpxchg.org
Signed-off-by: Johannes Weiner hannes@cmpxchg.org
Cc: Rik van Riel riel@redhat.com
Cc: Michal Hocko mhocko@suse.com
Cc: Peter Zijlstra (Intel) peterz@infradead.org
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
Conflicts:
	mm/memcontrol.c
Signed-off-by: Chen Zhou chenzhou10@huawei.com
Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
(cherry picked from commit fff2d6a5395de923e867c0303ee332dc393cd553)
Signed-off-by: Lu Jialin lujialin4@huawei.com
Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 mm/memcontrol.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3eeeb63c6d5ea..867ce4d091d70 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5949,6 +5949,13 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
+ seq_printf(m, "workingset_refault %lu\n", + acc.stat[WORKINGSET_REFAULT]); + seq_printf(m, "workingset_activate %lu\n", + acc.stat[WORKINGSET_ACTIVATE]); + seq_printf(m, "workingset_nodereclaim %lu\n", + acc.stat[WORKINGSET_NODERECLAIM]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + acc.events[PGSCAN_DIRECT]); @@ -5959,13 +5966,6 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
- seq_printf(m, "workingset_refault %lu\n", - acc.stat[WORKINGSET_REFAULT]); - seq_printf(m, "workingset_activate %lu\n", - acc.stat[WORKINGSET_ACTIVATE]); - seq_printf(m, "workingset_nodereclaim %lu\n", - acc.stat[WORKINGSET_NODERECLAIM]); - #ifdef CONFIG_TRANSPARENT_HUGEPAGE seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]); seq_printf(m, "thp_collapse_alloc %lu\n",
From: Chris Down chris@chrisdown.name
mainline inclusion
from mainline-v5.2-rc1
commit 871789d4af807d1e91a6299f12a67e06177ed420
category: bugfix
bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I
CVE: NA
-------------------------------------------------
I spent literally an hour trying to work out why an earlier version of my memory.events aggregation code didn't work properly, only to find out I was calling memcg->events instead of memcg->memory_events, which is fairly confusing.
This naming seems in need of reworking, so make it harder to do the wrong thing by using vmevents instead of events, which makes it clearer that these are vm counters rather than memcg-specific counters.
There are also a few other inconsistent names in both the percpu and aggregated structs, so these are all cleaned up to be more coherent and easier to understand.
This commit contains code cleanup only: there are no logic changes.
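For quick reference, the renames in this patch (as per the diff below) are:

    struct mem_cgroup_stat_cpu -> struct memcg_vmstats_percpu
    memcg->stat_cpu            -> memcg->vmstats_percpu
    stat_cpu->count[]          -> vmstats_percpu->stat[]
    memcg->stat[]              -> memcg->vmstats[]
    memcg->events[]            -> memcg->vmevents[]
    struct accumulated_stats   -> struct accumulated_vmstats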
[akpm@linux-foundation.org: fix it for preceding changes] Link: http://lkml.kernel.org/r/20190208224319.GA23801@chrisdown.name Signed-off-by: Chris Down chris@chrisdown.name Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@kernel.org Cc: Tejun Heo tj@kernel.org Cc: Roman Gushchin guro@fb.com Cc: Dennis Zhou dennis@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry picked from commit 9c05c7de252cfe92ed38d15b9f7966a6b75759a5) Conflicts: mm/memcontrol.c Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/memcontrol.h | 24 +++--- mm/memcontrol.c | 148 +++++++++++++++++++------------------ 2 files changed, 88 insertions(+), 84 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa34e8c707597..95a437c81c600 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -94,8 +94,8 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, };
-struct mem_cgroup_stat_cpu { - long count[MEMCG_NR_STAT]; +struct memcg_vmstats_percpu { + long stat[MEMCG_NR_STAT]; unsigned long events[NR_VM_EVENT_ITEMS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; @@ -273,12 +273,12 @@ struct mem_cgroup { struct task_struct *move_lock_task;
/* memory.stat */ - struct mem_cgroup_stat_cpu __percpu *stat_cpu; + struct memcg_vmstats_percpu __percpu *vmstats_percpu;
MEMCG_PADDING(_pad2_);
- atomic_long_t stat[MEMCG_NR_STAT]; - atomic_long_t events[NR_VM_EVENT_ITEMS]; + atomic_long_t vmstats[MEMCG_NR_STAT]; + atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
unsigned long socket_pressure; @@ -580,7 +580,7 @@ void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = atomic_long_read(&memcg->stat[idx]); + long x = atomic_long_read(&memcg->vmstats[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -597,12 +597,12 @@ static inline void __mod_memcg_state(struct mem_cgroup *memcg, if (mem_cgroup_disabled()) return;
- x = val + __this_cpu_read(memcg->stat_cpu->count[idx]); + x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->stat[idx]); + atomic_long_add(x, &memcg->vmstats[idx]); x = 0; } - __this_cpu_write(memcg->stat_cpu->count[idx], x); + __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); }
/* idx can be of type enum memcg_stat_item or node_stat_item */ @@ -740,12 +740,12 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg, if (mem_cgroup_disabled()) return;
- x = count + __this_cpu_read(memcg->stat_cpu->events[idx]); + x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); if (unlikely(x > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->events[idx]); + atomic_long_add(x, &memcg->vmevents[idx]); x = 0; } - __this_cpu_write(memcg->stat_cpu->events[idx], x); + __this_cpu_write(memcg->vmstats_percpu->events[idx], x); }
static inline void count_memcg_events(struct mem_cgroup *memcg, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 867ce4d091d70..746ef6f53a878 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -685,7 +685,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) static unsigned long memcg_sum_events(struct mem_cgroup *memcg, int event) { - return atomic_long_read(&memcg->events[event]); + return atomic_long_read(&memcg->vmevents[event]); }
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -717,7 +717,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, nr_pages = -nr_pages; /* for event */ }
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); + __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); }
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, @@ -756,8 +756,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, { unsigned long val, next;
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events); - next = __this_cpu_read(memcg->stat_cpu->targets[target]); + val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); + next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); /* from time_after() in jiffies.h */ if ((long)(next - val) < 0) { switch (target) { @@ -773,7 +773,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, default: break; } - __this_cpu_write(memcg->stat_cpu->targets[target], next); + __this_cpu_write(memcg->vmstats_percpu->targets[target], next); return true; } return false; @@ -2144,9 +2144,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) int nid; long x;
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0); + x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); if (x) - atomic_long_add(x, &memcg->stat[i]); + atomic_long_add(x, &memcg->vmstats[i]);
if (i >= NR_VM_NODE_STAT_ITEMS) continue; @@ -2164,9 +2164,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { long x;
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); + x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); if (x) - atomic_long_add(x, &memcg->events[i]); + atomic_long_add(x, &memcg->vmevents[i]); } }
@@ -3049,30 +3049,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; }
-struct accumulated_stats { - unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; +struct accumulated_vmstats { + unsigned long vmstats[MEMCG_NR_STAT]; + unsigned long vmevents[NR_VM_EVENT_ITEMS]; unsigned long lru_pages[NR_LRU_LISTS]; - const unsigned int *stats_array; - const unsigned int *events_array; - int stats_size; - int events_size; + + /* overrides for v1 */ + const unsigned int *vmstats_array; + const unsigned int *vmevents_array; + + int vmstats_size; + int vmevents_size; };
-static void accumulate_memcg_tree(struct mem_cgroup *memcg, - struct accumulated_stats *acc) +static void accumulate_vmstats(struct mem_cgroup *memcg, + struct accumulated_vmstats *acc) { struct mem_cgroup *mi; int i;
for_each_mem_cgroup_tree(mi, memcg) { - for (i = 0; i < acc->stats_size; i++) - acc->stat[i] += memcg_page_state(mi, - acc->stats_array ? acc->stats_array[i] : i); + for (i = 0; i < acc->vmstats_size; i++) + acc->vmstats[i] += memcg_page_state(mi, + acc->vmstats_array ? acc->vmstats_array[i] : i);
- for (i = 0; i < acc->events_size; i++) - acc->events[i] += memcg_sum_events(mi, - acc->events_array ? acc->events_array[i] : i); + for (i = 0; i < acc->vmevents_size; i++) + acc->vmevents[i] += memcg_sum_events(mi, + acc->vmevents_array + ? acc->vmevents_array[i] : i);
for (i = 0; i < NR_LRU_LISTS; i++) acc->lru_pages[i] += memcg_page_state(mi, @@ -3682,7 +3686,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; - struct accumulated_stats acc; + struct accumulated_vmstats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -3717,22 +3721,22 @@ static int memcg_stat_show(struct seq_file *m, void *v) (u64)memsw * PAGE_SIZE);
memset(&acc, 0, sizeof(acc)); - acc.stats_size = ARRAY_SIZE(memcg1_stats); - acc.stats_array = memcg1_stats; - acc.events_size = ARRAY_SIZE(memcg1_events); - acc.events_array = memcg1_events; - accumulate_memcg_tree(memcg, &acc); + acc.vmstats_size = ARRAY_SIZE(memcg1_stats); + acc.vmstats_array = memcg1_stats; + acc.vmevents_size = ARRAY_SIZE(memcg1_events); + acc.vmevents_array = memcg1_events; + accumulate_vmstats(memcg, &acc);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)acc.stat[i] * PAGE_SIZE); + (u64)acc.vmstats[i] * PAGE_SIZE); }
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], - (u64)acc.events[i]); + (u64)acc.vmevents[i]);
for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], @@ -4176,11 +4180,11 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) */ static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) { - long x = atomic_long_read(&memcg->stat[idx]); + long x = atomic_long_read(&memcg->vmstats[idx]); int cpu;
for_each_online_cpu(cpu) - x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx]; + x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; if (x < 0) x = 0; return x; @@ -4730,7 +4734,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); - free_percpu(memcg->stat_cpu); + free_percpu(memcg->vmstats_percpu);
memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg); kfree(memcg_ext); @@ -4763,8 +4767,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg->id.id < 0) goto fail;
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat_cpu) + memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu); + if (!memcg->vmstats_percpu) goto fail;
for_each_node(node) @@ -5886,7 +5890,7 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - struct accumulated_stats acc; + struct accumulated_vmstats acc; int i;
/* @@ -5901,30 +5905,30 @@ static int memory_stat_show(struct seq_file *m, void *v) */
memset(&acc, 0, sizeof(acc)); - acc.stats_size = MEMCG_NR_STAT; - acc.events_size = NR_VM_EVENT_ITEMS; - accumulate_memcg_tree(memcg, &acc); + acc.vmstats_size = MEMCG_NR_STAT; + acc.vmevents_size = NR_VM_EVENT_ITEMS; + accumulate_vmstats(memcg, &acc);
seq_printf(m, "anon %llu\n", - (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE); + (u64)acc.vmstats[MEMCG_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE); + (u64)acc.vmstats[MEMCG_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024); + (u64)acc.vmstats[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + - acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(acc.vmstats[NR_SLAB_RECLAIMABLE] + + acc.vmstats[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE); + (u64)acc.vmstats[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "shmem %llu\n", - (u64)acc.stat[NR_SHMEM] * PAGE_SIZE); + (u64)acc.vmstats[NR_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE); + (u64)acc.vmstats[NR_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE); + (u64)acc.vmstats[NR_FILE_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); + (u64)acc.vmstats[NR_WRITEBACK] * PAGE_SIZE);
/* * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter @@ -5933,43 +5937,43 @@ static int memory_stat_show(struct seq_file *m, void *v) * where the page->mem_cgroup is set up and stable. */ seq_printf(m, "anon_thp %llu\n", - (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE); + (u64)acc.vmstats[MEMCG_RSS_HUGE] * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], (u64)acc.lru_pages[i] * PAGE_SIZE);
seq_printf(m, "slab_reclaimable %llu\n", - (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)acc.vmstats[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)acc.vmstats[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
/* Accumulated memory events */
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", acc.vmevents[PGFAULT]); + seq_printf(m, "pgmajfault %lu\n", acc.vmevents[PGMAJFAULT]);
seq_printf(m, "workingset_refault %lu\n", - acc.stat[WORKINGSET_REFAULT]); + acc.vmstats[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", - acc.stat[WORKINGSET_ACTIVATE]); + acc.vmstats[WORKINGSET_ACTIVATE]); seq_printf(m, "workingset_nodereclaim %lu\n", - acc.stat[WORKINGSET_NODERECLAIM]); - - seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); - seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + - acc.events[PGSCAN_DIRECT]); - seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] + - acc.events[PGSTEAL_DIRECT]); - seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); - seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); - seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); - seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); + acc.vmstats[WORKINGSET_NODERECLAIM]); + + seq_printf(m, "pgrefill %lu\n", acc.vmevents[PGREFILL]); + seq_printf(m, "pgscan %lu\n", acc.vmevents[PGSCAN_KSWAPD] + + acc.vmevents[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", acc.vmevents[PGSTEAL_KSWAPD] + + acc.vmevents[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", acc.vmevents[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", acc.vmevents[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", acc.vmevents[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", acc.vmevents[PGLAZYFREED]);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE - seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]); + seq_printf(m, "thp_fault_alloc %lu\n", acc.vmevents[THP_FAULT_ALLOC]); seq_printf(m, "thp_collapse_alloc %lu\n", - acc.events[THP_COLLAPSE_ALLOC]); + acc.vmevents[THP_COLLAPSE_ALLOC]); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return 0; @@ -6411,7 +6415,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags);
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion from mainline-5.2-rc1 commit 113b7dfd827175977ea71cc4a29c1ac24acb9fce category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
Only memcg_numa_stat_show() uses those wrappers and the lru bitmasks, so group them together.
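For context, the bitmasks select which LRU lists a helper sums up. A sketch of how memcg_numa_stat_show()-style callers use them (illustrative only, following the helper signatures in the diff below):

    /* file pages charged to this memcg, across all nodes */
    unsigned long file = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);

    /* file pages of this memcg on one NUMA node */
    unsigned long node_file = mem_cgroup_node_nr_lru_pages(memcg, nid,
                                                           LRU_ALL_FILE);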
Link: http://lkml.kernel.org/r/20190228163020.24100-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Reviewed-by: Roman Gushchin guro@fb.com Cc: Michal Hocko mhocko@kernel.org Cc: Tejun Heo tj@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 113b7dfd827175977ea71cc4a29c1ac24acb9fce) Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry picked from commit 45cae6e90bca5194506b54a0eb86735a56dafd39) Conflicts: mm/memcontrol.c Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/mmzone.h | 5 ---- mm/memcontrol.c | 67 +++++++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 36 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 08ea0f24077e3..e7d2bca35682e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -246,11 +246,6 @@ struct lruvec { #endif };
-/* Mask used at gathering information at once (see memcontrol.c) */ -#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) -#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) -#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) - /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 746ef6f53a878..5c0d6f1442bb7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -720,37 +720,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); }
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); - unsigned long nr = 0; - enum lru_list lru; - - VM_BUG_ON((unsigned)nid >= nr_node_ids); - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); - } - return nr; -} - -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, - unsigned int lru_mask) -{ - unsigned long nr = 0; - enum lru_list lru; - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - nr += memcg_page_state(memcg, NR_LRU_BASE + lru); - } - return nr; -} - static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, enum mem_cgroup_events_target target) { @@ -3615,6 +3584,42 @@ int sysctl_memcg_qos_handler(struct ctl_table *table, int write, #endif
#ifdef CONFIG_NUMA + +#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) +#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) +#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) + +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + unsigned long nr = 0; + enum lru_list lru; + + VM_BUG_ON((unsigned)nid >= nr_node_ids); + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + } + return nr; +} + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, + unsigned int lru_mask) +{ + unsigned long nr = 0; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + } + return nr; +} + static int memcg_numa_stat_show(struct seq_file *m, void *v) { struct numa_stat {
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion from mainline-v5.2-rc1 commit 205b20cc5a99cdf197c32f4dbee2b09c699477f0 category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
Patch series "mm: memcontrol: memory.stat cost & correctness".
The cgroup memory.stat file holds recursive statistics for the entire subtree. The current implementation does this tree walk on-demand whenever the file is read. This is giving us problems in production.
1. The cost of aggregating the statistics on-demand is high. A lot of system service cgroups are mostly idle and their stats don't change between reads, yet we always have to check them. There are also always some lazily-dying cgroups sitting around that are pinned by a handful of remaining page cache; the same applies to them.
In an application that periodically monitors memory.stat in our fleet, we have seen the aggregation consume up to 5% CPU time.
2. When cgroups die and disappear from the cgroup tree, so do their accumulated vm events. The result is that the event counters at higher-level cgroups can go backwards and confuse some of our automation, not to mention people looking at the graphs over time.
To address both issues, this patch series changes the stat implementation to spill counts upwards when the counters change.
The upward spilling is batched using the existing per-cpu cache. In a sparse file stress test with 5 level cgroup nesting, the additional cost of the flushing was negligible (a little under 1% of CPU at 100% CPU utilization, compared to the 5% of reading memory.stat during regular operation).
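As a sketch, the write side this series arrives at looks roughly like the following (mirroring __mod_memcg_state() in the later patches of this series; the separate local-counter bookkeeping is omitted here):

    x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
    if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
            struct mem_cgroup *mi;

            /* batch is full: spill it into this group and every ancestor */
            for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
                    atomic_long_add(x, &mi->vmstats[idx]);
            x = 0;
    }
    __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);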
This patch (of 4):
memcg_page_state(), lruvec_page_state() and memcg_sum_events() currently return the state of the local memcg or lruvec, not the recursive state.
In practice there is a demand for both versions, although the callers that want the recursive counts currently sum them up by hand.
By default, cgroups are considered recursive entities and generally we expect more users of the recursive counters, with the local counts being special cases. To reflect that in the name, add a _local suffix to the current implementations.
The following patch will re-incarnate these functions with recursive semantics, but with an O(1) implementation.
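Until then, callers that want the recursive view keep summing by hand, along these lines (a sketch mirroring accumulate_vmstats() below):

    unsigned long total = 0;
    struct mem_cgroup *mi;

    /* walk the whole subtree and add up each level's local count */
    for_each_mem_cgroup_tree(mi, memcg)
            total += memcg_page_state_local(mi, idx);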
[hannes@cmpxchg.org: fix bisection hole] Link: http://lkml.kernel.org/r/20190417160347.GC23013@cmpxchg.org Link: http://lkml.kernel.org/r/20190412151507.2769-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Reviewed-by: Shakeel Butt shakeelb@google.com Reviewed-by: Roman Gushchin guro@fb.com Cc: Michal Hocko mhocko@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
Conflicts: mm/vmscan.c
Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/memcontrol.h | 16 +++++++-------- mm/memcontrol.c | 40 ++++++++++++++++++++------------------ mm/vmscan.c | 6 +++--- mm/workingset.c | 7 ++++--- 4 files changed, 36 insertions(+), 33 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 95a437c81c600..d8a7c0a95fdf9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -577,8 +577,8 @@ void unlock_page_memcg(struct page *page); * idx can be of type enum memcg_stat_item or node_stat_item. * Keep in sync with memcg_exact_page_state(). */ -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, - int idx) +static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, + int idx) { long x = atomic_long_read(&memcg->vmstats[idx]); #ifdef CONFIG_SMP @@ -647,8 +647,8 @@ static inline void mod_memcg_page_state(struct page *page, mod_memcg_state(page->mem_cgroup, idx, val); }
-static inline unsigned long lruvec_page_state(struct lruvec *lruvec, - enum node_stat_item idx) +static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, + enum node_stat_item idx) { struct mem_cgroup_per_node *pn; long x; @@ -1029,8 +1029,8 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { }
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, - int idx) +static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, + int idx) { return 0; } @@ -1059,8 +1059,8 @@ static inline void mod_memcg_page_state(struct page *page, { }
-static inline unsigned long lruvec_page_state(struct lruvec *lruvec, - enum node_stat_item idx) +static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, + enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c0d6f1442bb7..3808ada67f421 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -682,8 +682,8 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; }
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg, - int event) +static unsigned long memcg_events_local(struct mem_cgroup *memcg, + int event) { return atomic_long_read(&memcg->vmevents[event]); } @@ -1342,12 +1342,14 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) continue; pr_cont(" %s:%luKB", memcg1_stat_names[i], - K(memcg_page_state(iter, memcg1_stats[i]))); + K(memcg_page_state_local(iter, + memcg1_stats[i]))); }
for (i = 0; i < NR_LRU_LISTS; i++) pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], - K(memcg_page_state(iter, NR_LRU_BASE + i))); + K(memcg_page_state_local(iter, + NR_LRU_BASE + i)));
pr_cont("\n"); } @@ -1413,13 +1415,13 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, { struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
- if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) || - lruvec_page_state(lruvec, NR_ACTIVE_FILE)) + if (lruvec_page_state_local(lruvec, NR_INACTIVE_FILE) || + lruvec_page_state_local(lruvec, NR_ACTIVE_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) || - lruvec_page_state(lruvec, NR_ACTIVE_ANON)) + if (lruvec_page_state_local(lruvec, NR_INACTIVE_ANON) || + lruvec_page_state_local(lruvec, NR_ACTIVE_ANON)) return true; return false;
@@ -3039,16 +3041,16 @@ static void accumulate_vmstats(struct mem_cgroup *memcg,
for_each_mem_cgroup_tree(mi, memcg) { for (i = 0; i < acc->vmstats_size; i++) - acc->vmstats[i] += memcg_page_state(mi, + acc->vmstats[i] += memcg_page_state_local(mi, acc->vmstats_array ? acc->vmstats_array[i] : i);
for (i = 0; i < acc->vmevents_size; i++) - acc->vmevents[i] += memcg_sum_events(mi, + acc->vmevents[i] += memcg_events_local(mi, acc->vmevents_array ? acc->vmevents_array[i] : i);
for (i = 0; i < NR_LRU_LISTS; i++) - acc->lru_pages[i] += memcg_page_state(mi, + acc->lru_pages[i] += memcg_page_state_local(mi, NR_LRU_BASE + i); } } @@ -3061,10 +3063,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg) { - val += memcg_page_state(iter, MEMCG_CACHE); - val += memcg_page_state(iter, MEMCG_RSS); + val += memcg_page_state_local(iter, MEMCG_CACHE); + val += memcg_page_state_local(iter, MEMCG_RSS); if (swap) - val += memcg_page_state(iter, MEMCG_SWAP); + val += memcg_page_state_local(iter, MEMCG_SWAP); } } else { if (!swap) @@ -3601,7 +3603,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); } return nr; } @@ -3615,7 +3617,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); } return nr; } @@ -3700,17 +3702,17 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "%s %lu\n", memcg1_stat_names[i], - memcg_page_state(memcg, memcg1_stats[i]) * + memcg_page_state_local(memcg, memcg1_stats[i]) * PAGE_SIZE); }
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "%s %lu\n", memcg1_event_names[i], - memcg_sum_events(memcg, memcg1_events[i])); + memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], - memcg_page_state(memcg, NR_LRU_BASE + i) * + memcg_page_state_local(memcg, NR_LRU_BASE + i) * PAGE_SIZE);
/* Hierarchical information */ diff --git a/mm/vmscan.c b/mm/vmscan.c index d2c268ca401c5..f3187be99d3d6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -356,7 +356,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone int zid;
if (!mem_cgroup_disabled()) - lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru); + lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); else lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
@@ -2254,7 +2254,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, * is being established. Disable active list protection to get * rid of the stale workingset quickly. */ - refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE); + refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); if (file && lruvec->refaults != refaults) { inactive_ratio = 0; } else { @@ -3013,7 +3013,7 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) struct lruvec *lruvec;
lruvec = mem_cgroup_lruvec(pgdat, memcg); - refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE); + refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); lruvec->refaults = refaults; } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); } diff --git a/mm/workingset.c b/mm/workingset.c index e83307413b5f8..a1f61b3a0cd3d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -397,9 +397,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) - pages += lruvec_page_state(lruvec, NR_LRU_BASE + i); - pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); - pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); + pages += lruvec_page_state_local(lruvec, + NR_LRU_BASE + i); + pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE); + pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE); } else #endif pages = node_present_pages(sc->nid);
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion from mainline-v5.2-rc1 commit db9adbcbe740e0986b575dd56aad834ce9e9b5d3 category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
These are getting too big to be inlined in every callsite. They were stolen from vmstat.c, which already out-of-lines them, and they have only been growing since. The callsites aren't that hot, either.
Move __mod_memcg_state(), __mod_lruvec_state() and __count_memcg_events() out of line and add kerneldoc comments.
Link: http://lkml.kernel.org/r/20190412151507.2769-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Reviewed-by: Shakeel Butt shakeelb@google.com Reviewed-by: Roman Gushchin guro@fb.com Cc: Michal Hocko mhocko@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/memcontrol.h | 62 +++--------------------------- mm/memcontrol.c | 79 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 57 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d8a7c0a95fdf9..49ccb3148584d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -588,22 +588,7 @@ static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, return x; }
-/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __mod_memcg_state(struct mem_cgroup *memcg, - int idx, int val) -{ - long x; - - if (mem_cgroup_disabled()) - return; - - x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); - if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->vmstats[idx]); - x = 0; - } - __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); -} +void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, @@ -665,31 +650,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; }
-static inline void __mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - struct mem_cgroup_per_node *pn; - long x; - - /* Update node */ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); - - if (mem_cgroup_disabled()) - return; - - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - - /* Update memcg */ - __mod_memcg_state(pn->memcg, idx, val); - - /* Update lruvec */ - x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); - if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &pn->lruvec_stat[idx]); - x = 0; - } - __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); -} +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val);
static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) @@ -731,22 +693,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned);
-static inline void __count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) -{ - unsigned long x; - - if (mem_cgroup_disabled()) - return; - - x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); - if (unlikely(x > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->vmevents[idx]); - x = 0; - } - __this_cpu_write(memcg->vmstats_percpu->events[idx], x); -} +void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + unsigned long count);
static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3808ada67f421..34bc0bb9648bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -682,6 +682,85 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; }
+/** + * __mod_memcg_state - update cgroup memory statistics + * @memcg: the memory cgroup + * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item + * @val: delta to add to the counter, can be negative + */ +void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) +{ + long x; + + if (mem_cgroup_disabled()) + return; + + x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->vmstats[idx]); + x = 0; + } + __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); +} + +/** + * __mod_lruvec_state - update lruvec memory statistics + * @lruvec: the lruvec + * @idx: the stat item + * @val: delta to add to the counter, can be negative + * + * The lruvec is the intersection of the NUMA node and a cgroup. This + * function updates the all three counters that are affected by a + * change of state at this level: per-node, per-cgroup, per-lruvec. + */ +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val) +{ + struct mem_cgroup_per_node *pn; + long x; + + /* Update node */ + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + + if (mem_cgroup_disabled()) + return; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + + /* Update memcg */ + __mod_memcg_state(pn->memcg, idx, val); + + /* Update lruvec */ + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &pn->lruvec_stat[idx]); + x = 0; + } + __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); +} + +/** + * __count_memcg_events - account VM events in a cgroup + * @memcg: the memory cgroup + * @idx: the event item + * @count: the number of events that occured + */ +void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + unsigned long count) +{ + unsigned long x; + + if (mem_cgroup_disabled()) + return; + + x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); + if (unlikely(x > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->vmevents[idx]); + x = 0; + } + __this_cpu_write(memcg->vmstats_percpu->events[idx], x); +} + static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) {
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion from mainline-v5.2-rc1 commit 42a300353577ccc17ecc627b8570a89fa1678bec category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
Right now, when somebody needs to know the recursive memory statistics and events of a cgroup subtree, they need to walk the entire subtree and sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters should all be 0 at that point, of course, but the events are not. When this happens, the event counters, which are supposed to be monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily freed cgroups sitting around that have been deleted, have no tasks, but have a few cache pages remaining. These groups' statistics do not change until we eventually hit memory pressure, but somebody watching, say, memory.stat on an ancestor has to iterate those every time.
This patch addresses both issues by introducing recursive counters at each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the local atomic counter. This is the same thing we do during charge and uncharge, except that the latter uses atomic RMWs, which are more expensive; stat changes happen at around the same rate. In a sparse file test (page faults and reclaim at maximum CPU speed) with 5 cgroup nesting levels, perf shows __mod_memcg_page_state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Reviewed-by: Shakeel Butt shakeelb@google.com Reviewed-by: Roman Gushchin guro@fb.com Cc: Michal Hocko mhocko@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry picked from commit a7293860df418a1a7e37822d251af8d748a8f69e) Conflicts: mm/memcontrol.c Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/memcontrol.h | 54 +++++++++- mm/memcontrol.c | 205 ++++++++++++++++++------------------- 2 files changed, 150 insertions(+), 109 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 49ccb3148584d..77af53f47afda 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -128,6 +128,7 @@ struct mem_cgroup_per_node {
struct lruvec_stat __percpu *lruvec_stat_cpu; atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + atomic_long_t lruvec_stat_local[NR_VM_NODE_STAT_ITEMS];
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
@@ -278,8 +279,12 @@ struct mem_cgroup { MEMCG_PADDING(_pad2_);
atomic_long_t vmstats[MEMCG_NR_STAT]; + atomic_long_t vmstats_local[MEMCG_NR_STAT]; + atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; - atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; + atomic_long_t vmevents_local[NR_VM_EVENT_ITEMS]; + + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
unsigned long socket_pressure;
@@ -573,6 +578,20 @@ struct mem_cgroup *lock_page_memcg(struct page *page); void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page);
+/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page_state(). + */ +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = atomic_long_read(&memcg->vmstats[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + /* * idx can be of type enum memcg_stat_item or node_stat_item. * Keep in sync with memcg_exact_page_state(). @@ -580,7 +599,7 @@ void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { - long x = atomic_long_read(&memcg->vmstats[idx]); + long x = atomic_long_read(&memcg->vmstats_local[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -632,6 +651,24 @@ static inline void mod_memcg_page_state(struct page *page, mod_memcg_state(page->mem_cgroup, idx, val); }
+static inline unsigned long lruvec_page_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + struct mem_cgroup_per_node *pn; + long x; + + if (mem_cgroup_disabled()) + return node_page_state(lruvec_pgdat(lruvec), idx); + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + x = atomic_long_read(&pn->lruvec_stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { @@ -642,7 +679,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx);
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = atomic_long_read(&pn->lruvec_stat[idx]); + x = atomic_long_read(&pn->lruvec_stat_local[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -977,6 +1014,11 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { }
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + return 0; +} + static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { @@ -1007,6 +1049,12 @@ static inline void mod_memcg_page_state(struct page *page, { }
+static inline unsigned long lruvec_page_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + return node_page_state(lruvec_pgdat(lruvec), idx); +} + static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 34bc0bb9648bb..d7570cd5c2078 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -697,12 +697,27 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->vmstats[idx]); + struct mem_cgroup *mi; + + atomic_long_add(x, &memcg->vmstats_local[idx]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &mi->vmstats[idx]); x = 0; } __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); }
+static struct mem_cgroup_per_node * +parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) +{ + struct mem_cgroup *parent; + + parent = parent_mem_cgroup(pn->memcg); + if (!parent) + return NULL; + return mem_cgroup_nodeinfo(parent, nid); +} + /** * __mod_lruvec_state - update lruvec memory statistics * @lruvec: the lruvec @@ -716,24 +731,31 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { + pg_data_t *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup_per_node *pn; + struct mem_cgroup *memcg; long x;
/* Update node */ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + __mod_node_page_state(pgdat, idx, val);
if (mem_cgroup_disabled()) return;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = pn->memcg;
/* Update memcg */ - __mod_memcg_state(pn->memcg, idx, val); + __mod_memcg_state(memcg, idx, val);
/* Update lruvec */ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &pn->lruvec_stat[idx]); + struct mem_cgroup_per_node *pi; + + atomic_long_add(x, &pn->lruvec_stat_local[idx]); + for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) + atomic_long_add(x, &pi->lruvec_stat[idx]); x = 0; } __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); @@ -755,18 +777,26 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); if (unlikely(x > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->vmevents[idx]); + struct mem_cgroup *mi; + + atomic_long_add(x, &memcg->vmevents_local[idx]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &mi->vmevents[idx]); x = 0; } __this_cpu_write(memcg->vmstats_percpu->events[idx], x); }
-static unsigned long memcg_events_local(struct mem_cgroup *memcg, - int event) +static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { return atomic_long_read(&memcg->vmevents[event]); }
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) +{ + return atomic_long_read(&memcg->vmevents_local[event]); +} + static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, bool compound, int nr_pages) @@ -2182,7 +2212,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *mi;
stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); @@ -2195,8 +2225,11 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) long x;
x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); - if (x) - atomic_long_add(x, &memcg->vmstats[i]); + if (x) { + atomic_long_add(x, &memcg->vmstats_local[i]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &memcg->vmstats[i]); + }
if (i >= NR_VM_NODE_STAT_ITEMS) continue; @@ -2206,8 +2239,12 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
pn = mem_cgroup_nodeinfo(memcg, nid); x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); - if (x) - atomic_long_add(x, &pn->lruvec_stat[i]); + if (x) { + atomic_long_add(x, &pn->lruvec_stat_local[i]); + do { + atomic_long_add(x, &pn->lruvec_stat[i]); + } while ((pn = parent_nodeinfo(pn, nid))); + } } }
@@ -2215,8 +2252,11 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) long x;
x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); - if (x) - atomic_long_add(x, &memcg->vmevents[i]); + if (x) { + atomic_long_add(x, &memcg->vmevents_local[i]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &memcg->vmevents[i]); + } } }
@@ -3099,54 +3139,15 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; }
-struct accumulated_vmstats { - unsigned long vmstats[MEMCG_NR_STAT]; - unsigned long vmevents[NR_VM_EVENT_ITEMS]; - unsigned long lru_pages[NR_LRU_LISTS]; - - /* overrides for v1 */ - const unsigned int *vmstats_array; - const unsigned int *vmevents_array; - - int vmstats_size; - int vmevents_size; -}; - -static void accumulate_vmstats(struct mem_cgroup *memcg, - struct accumulated_vmstats *acc) -{ - struct mem_cgroup *mi; - int i; - - for_each_mem_cgroup_tree(mi, memcg) { - for (i = 0; i < acc->vmstats_size; i++) - acc->vmstats[i] += memcg_page_state_local(mi, - acc->vmstats_array ? acc->vmstats_array[i] : i); - - for (i = 0; i < acc->vmevents_size; i++) - acc->vmevents[i] += memcg_events_local(mi, - acc->vmevents_array - ? acc->vmevents_array[i] : i); - - for (i = 0; i < NR_LRU_LISTS; i++) - acc->lru_pages[i] += memcg_page_state_local(mi, - NR_LRU_BASE + i); - } -} - static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - unsigned long val = 0; + unsigned long val;
if (mem_cgroup_is_root(memcg)) { - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) { - val += memcg_page_state_local(iter, MEMCG_CACHE); - val += memcg_page_state_local(iter, MEMCG_RSS); - if (swap) - val += memcg_page_state_local(iter, MEMCG_SWAP); - } + val = memcg_page_state(memcg, MEMCG_CACHE) + + memcg_page_state(memcg, MEMCG_RSS); + if (swap) + val += memcg_page_state(memcg, MEMCG_SWAP); } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -3772,7 +3773,6 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; - struct accumulated_vmstats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -3806,27 +3806,21 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE);
- memset(&acc, 0, sizeof(acc)); - acc.vmstats_size = ARRAY_SIZE(memcg1_stats); - acc.vmstats_array = memcg1_stats; - acc.vmevents_size = ARRAY_SIZE(memcg1_events); - acc.vmevents_array = memcg1_events; - accumulate_vmstats(memcg, &acc); - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)acc.vmstats[i] * PAGE_SIZE); + (u64)memcg_page_state(memcg, i) * PAGE_SIZE); }
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], - (u64)acc.vmevents[i]); + (u64)memcg_events(memcg, i));
for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], - (u64)acc.lru_pages[i] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM { @@ -5976,7 +5970,6 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - struct accumulated_vmstats acc; int i;
/* @@ -5990,31 +5983,27 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */
- memset(&acc, 0, sizeof(acc)); - acc.vmstats_size = MEMCG_NR_STAT; - acc.vmevents_size = NR_VM_EVENT_ITEMS; - accumulate_vmstats(memcg, &acc); - seq_printf(m, "anon %llu\n", - (u64)acc.vmstats[MEMCG_RSS] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)acc.vmstats[MEMCG_CACHE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)acc.vmstats[MEMCG_KERNEL_STACK_KB] * 1024); + (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024); seq_printf(m, "slab %llu\n", - (u64)(acc.vmstats[NR_SLAB_RECLAIMABLE] + - acc.vmstats[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * + PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)acc.vmstats[MEMCG_SOCK] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE);
seq_printf(m, "shmem %llu\n", - (u64)acc.vmstats[NR_SHMEM] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)acc.vmstats[NR_FILE_MAPPED] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)acc.vmstats[NR_FILE_DIRTY] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)acc.vmstats[NR_WRITEBACK] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE);
/* * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter @@ -6023,43 +6012,47 @@ static int memory_stat_show(struct seq_file *m, void *v) * where the page->mem_cgroup is set up and stable. */ seq_printf(m, "anon_thp %llu\n", - (u64)acc.vmstats[MEMCG_RSS_HUGE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], - (u64)acc.lru_pages[i] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE);
seq_printf(m, "slab_reclaimable %llu\n", - (u64)acc.vmstats[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * + PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)acc.vmstats[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * + PAGE_SIZE);
/* Accumulated memory events */
- seq_printf(m, "pgfault %lu\n", acc.vmevents[PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", acc.vmevents[PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT)); + seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
seq_printf(m, "workingset_refault %lu\n", - acc.vmstats[WORKINGSET_REFAULT]); + memcg_page_state(memcg, WORKINGSET_REFAULT)); seq_printf(m, "workingset_activate %lu\n", - acc.vmstats[WORKINGSET_ACTIVATE]); + memcg_page_state(memcg, WORKINGSET_ACTIVATE)); seq_printf(m, "workingset_nodereclaim %lu\n", - acc.vmstats[WORKINGSET_NODERECLAIM]); - - seq_printf(m, "pgrefill %lu\n", acc.vmevents[PGREFILL]); - seq_printf(m, "pgscan %lu\n", acc.vmevents[PGSCAN_KSWAPD] + - acc.vmevents[PGSCAN_DIRECT]); - seq_printf(m, "pgsteal %lu\n", acc.vmevents[PGSTEAL_KSWAPD] + - acc.vmevents[PGSTEAL_DIRECT]); - seq_printf(m, "pgactivate %lu\n", acc.vmevents[PGACTIVATE]); - seq_printf(m, "pgdeactivate %lu\n", acc.vmevents[PGDEACTIVATE]); - seq_printf(m, "pglazyfree %lu\n", acc.vmevents[PGLAZYFREE]); - seq_printf(m, "pglazyfreed %lu\n", acc.vmevents[PGLAZYFREED]); + memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); + + seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL)); + seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + + memcg_events(memcg, PGSCAN_DIRECT)); + seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + + memcg_events(memcg, PGSTEAL_DIRECT)); + seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE)); + seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE)); + seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE)); + seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE - seq_printf(m, "thp_fault_alloc %lu\n", acc.vmevents[THP_FAULT_ALLOC]); + seq_printf(m, "thp_fault_alloc %lu\n", + memcg_events(memcg, THP_FAULT_ALLOC)); seq_printf(m, "thp_collapse_alloc %lu\n", - acc.vmevents[THP_COLLAPSE_ALLOC]); + memcg_events(memcg, THP_COLLAPSE_ALLOC)); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return 0;
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion from mainline-v5.2-rc1 commit def0fdae813dbbbbb588bfc5f52856be2e842b35 category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
When a cgroup is reclaimed on behalf of a configured limit, reclaim needs to round-robin through all NUMA nodes that hold pages of the memcg in question. However, when assembling the mask of candidate NUMA nodes, the code only consults the *local* cgroup LRU counters, not the recursive counters for the entire subtree. Cgroup limits are frequently configured against intermediate cgroups that do not have memory on their own LRUs. In this case, the node mask will always come up empty and reclaim falls back to scanning only the current node.
If a cgroup subtree has some memory on one node but the processes are bound to another node afterwards, the limit reclaim will never age or reclaim that memory anymore.
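As an illustration (hypothetical layout): with the limit configured on an intermediate cgroup A whose memory is all charged to a child A/b, A's own LRU counters are zero on every node, the candidate nodemask comes up empty, and limit reclaim falls back to scanning only the node the allocating task happens to run on.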
To fix this, use the recursive LRU counts for a cgroup subtree to determine which nodes hold memory of that cgroup.
The code has been broken like this forever, so it doesn't seem to be a problem in practice. I just noticed it while reviewing the way the LRU counters are used in general.
Link: http://lkml.kernel.org/r/20190412151507.2769-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Reviewed-by: Shakeel Butt shakeelb@google.com Reviewed-by: Roman Gushchin guro@fb.com Cc: Michal Hocko mhocko@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry picked from commit 01bd66c080ee6c7ae41af4c0ba226c0b84f1bfc8) Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d7570cd5c2078..90cf8e2ba15fc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1524,13 +1524,13 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 {
 	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);

-	if (lruvec_page_state_local(lruvec, NR_INACTIVE_FILE) ||
-	    lruvec_page_state_local(lruvec, NR_ACTIVE_FILE))
+	if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
+	    lruvec_page_state(lruvec, NR_ACTIVE_FILE))
 		return true;
 	if (noswap || !total_swap_pages)
 		return false;
-	if (lruvec_page_state_local(lruvec, NR_INACTIVE_ANON) ||
-	    lruvec_page_state_local(lruvec, NR_ACTIVE_ANON))
+	if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
+	    lruvec_page_state(lruvec, NR_ACTIVE_ANON))
 		return true;
 	return false;
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion from mainline-v5.2-rc5 commit 815744d75152078cde5391fc1e3c2d4424323fb6 category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
The kernel test robot noticed a 26% will-it-scale pagefault regression from commit 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty"). This appears to be caused by bouncing the additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully maintained per cpu. A reader of a stats file high up in the cgroup tree would have to walk the entire subtree and collect each level's per-cpu counters to get the recursive view. This was prohibitively expensive, and so we switched to per-cpu batched updates of the local counters during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting"), reducing the complexity from nr_subgroups * nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too expensive for monitoring top-level groups, and this is when the culprit patch added hierarchy counters on each cgroup level. When the per-cpu batch size would be reached, both the local and the hierarchy counters would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it unfortunately makes the write side too cacheline-intensive.
Since local counter reads were never a problem - we only centralized them to accelerate the hierarchy walk - and use of the local counters is becoming rarer due to replacement with hierarchical views (ongoing rework in the page reclaim and workingset code), we can make those local counters unbatched per-cpu counters again.
The scheme will then be as follows:

when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
  - spill the batch into the group's atomic_t
  - spill the batch into all ancestors' atomic_ts
  - empty out the batch counter (per-cpu)

when a local memcg counter is read, the reader will:
- collect the local counter from all cpus

when a hierarchy memcg counter is read, the reader will:
- read the atomic_t
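A rough userspace sketch of this scheme (NR_CPUS, BATCH and all struct/function names are made up for illustration; plain arrays stand in for real percpu storage):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4
#define BATCH   32

struct group {
	struct group *parent;
	long local[NR_CPUS];   /* "per-cpu" local counter, never batched */
	long batch[NR_CPUS];   /* "per-cpu" batch for the subtree counters */
	atomic_long hier;      /* recursive (subtree) counter */
};

/* writer: called whenever a statistic changes on 'cpu' */
static void mod_stat(struct group *g, int cpu, long val)
{
	g->local[cpu] += val;                 /* 1. update the local counter */

	long x = g->batch[cpu] + val;         /* 2. update the batch counter */
	if (labs(x) > BATCH) {                /* batch full: spill upward */
		for (struct group *mi = g; mi; mi = mi->parent)
			atomic_fetch_add(&mi->hier, x);
		x = 0;                        /* 3. empty out the batch */
	}
	g->batch[cpu] = x;
}

/* local read: sum the per-cpu local counters */
static long read_local(struct group *g)
{
	long x = 0;
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		x += g->local[cpu];
	return x;
}

/* hierarchical read: a single atomic load */
static long read_hier(struct group *g)
{
	return atomic_load(&g->hier);
}

int main(void)
{
	struct group parent = { 0 }, child = { .parent = &parent };

	for (int i = 0; i < 100; i++)
		mod_stat(&child, 0, 1);       /* 100 single-page charges on cpu 0 */

	printf("child local %ld, child hier %ld, parent hier %ld\n",
	       read_local(&child), read_hier(&child), read_hier(&parent));
	return 0;
}

Running it shows the expected skew: the local read is exact (100) while the hierarchical counters hold 99, with one page still sitting in the per-cpu batch.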
We might be able to simplify this further and make the recursive counters unbatched per-cpu counters as well (batch upward propagation, but leave per-cpu collection to the readers), but that will require a more in-depth analysis and testing of all the callsites. Deal with the immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty") Signed-off-by: Johannes Weiner hannes@cmpxchg.org Reported-by: kernel test robot rong.a.chen@intel.com Tested-by: kernel test robot rong.a.chen@intel.com Cc: Michal Hocko mhocko@kernel.org Cc: Shakeel Butt shakeelb@google.com Cc: Roman Gushchin guro@fb.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry-pick commit from f5c6200d96e39a6dd80d397d54d6cd7a22c7d968) Conflicts: mm/memcontrol.c Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/memcontrol.h | 26 ++++++++++++++++-------- mm/memcontrol.c | 41 ++++++++++++++++++++++++++------------ 2 files changed, 46 insertions(+), 21 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 77af53f47afda..5425128cb2b93 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -126,9 +126,12 @@ struct memcg_shrinker_map {
 struct mem_cgroup_per_node {
 	struct lruvec lruvec;

+	/* Legacy local VM stats */
+	struct lruvec_stat __percpu *lruvec_stat_local;
+
+	/* Subtree VM stats (batched updates) */
 	struct lruvec_stat __percpu *lruvec_stat_cpu;
 	atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
-	atomic_long_t lruvec_stat_local[NR_VM_NODE_STAT_ITEMS];

 	unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];

@@ -273,17 +276,18 @@ struct mem_cgroup {
 	atomic_t moving_account;
 	struct task_struct *move_lock_task;

-	/* memory.stat */
+	/* Legacy local VM stats and events */
+	struct memcg_vmstats_percpu __percpu *vmstats_local;
+
+	/* Subtree VM stats and events (batched updates) */
 	struct memcg_vmstats_percpu __percpu *vmstats_percpu;

 	MEMCG_PADDING(_pad2_);

 	atomic_long_t vmstats[MEMCG_NR_STAT];
-	atomic_long_t vmstats_local[MEMCG_NR_STAT];

 	atomic_long_t vmevents[NR_VM_EVENT_ITEMS];
-	atomic_long_t vmevents_local[NR_VM_EVENT_ITEMS];

+	/* memory.events */
 	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];

 	unsigned long socket_pressure;

@@ -599,7 +603,11 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
						   int idx)
 {
-	long x = atomic_long_read(&memcg->vmstats_local[idx]);
+	long x = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -673,13 +681,15 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
					 enum node_stat_item idx)
 {
 	struct mem_cgroup_per_node *pn;
-	long x;
+	long x = 0;
+	int cpu;

 	if (mem_cgroup_disabled())
 		return node_page_state(lruvec_pgdat(lruvec), idx);

 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	x = atomic_long_read(&pn->lruvec_stat_local[idx]);
+	for_each_possible_cpu(cpu)
+		x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90cf8e2ba15fc..518ba12ae4d53 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -695,11 +695,12 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;

+	__this_cpu_add(memcg->vmstats_local->stat[idx], val);
+
 	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;

-		atomic_long_add(x, &memcg->vmstats_local[idx]);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmstats[idx]);
 		x = 0;
@@ -749,11 +750,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	__mod_memcg_state(memcg, idx, val);

 	/* Update lruvec */
+	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
 	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup_per_node *pi;

-		atomic_long_add(x, &pn->lruvec_stat_local[idx]);
 		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 			atomic_long_add(x, &pi->lruvec_stat[idx]);
 		x = 0;
@@ -775,11 +777,12 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled())
 		return;

+	__this_cpu_add(memcg->vmstats_local->events[idx], count);
+
 	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;

-		atomic_long_add(x, &memcg->vmevents_local[idx]);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmevents[idx]);
 		x = 0;
@@ -794,7 +797,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)

 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 {
-	return atomic_long_read(&memcg->vmevents_local[event]);
+	long x = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		x += per_cpu(memcg->vmstats_local->events[event], cpu);
+	return x;
 }

 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -2225,11 +2233,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 			long x;

 			x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
-			if (x) {
-				atomic_long_add(x, &memcg->vmstats_local[i]);
+			if (x)
 				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 					atomic_long_add(x, &memcg->vmstats[i]);
-			}

 			if (i >= NR_VM_NODE_STAT_ITEMS)
 				continue;
@@ -2239,12 +2245,10 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)

 				pn = mem_cgroup_nodeinfo(memcg, nid);
 				x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
-				if (x) {
-					atomic_long_add(x, &pn->lruvec_stat_local[i]);
+				if (x)
 					do {
 						atomic_long_add(x, &pn->lruvec_stat[i]);
 					} while ((pn = parent_nodeinfo(pn, nid)));
-				}
 			}
 		}

@@ -2252,11 +2256,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 			long x;

 			x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
-			if (x) {
-				atomic_long_add(x, &memcg->vmevents_local[i]);
+			if (x)
 				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 					atomic_long_add(x, &memcg->vmevents[i]);
-			}
 		}
 	}

@@ -4781,8 +4783,15 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;

+	pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+	if (!pn->lruvec_stat_local) {
+		kfree(pn);
+		return 1;
+	}
+
 	pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
 	if (!pn->lruvec_stat_cpu) {
+		free_percpu(pn->lruvec_stat_local);
 		kfree(pn);
 		return 1;
 	}
@@ -4804,6 +4813,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 		return;

 	free_percpu(pn->lruvec_stat_cpu);
+	free_percpu(pn->lruvec_stat_local);
 	kfree(pn);
 }

@@ -4815,6 +4825,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->vmstats_percpu);
+	free_percpu(memcg->vmstats_local);

 	memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg);
 	kfree(memcg_ext);
@@ -4847,6 +4858,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (memcg->id.id < 0)
 		goto fail;

+	memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+	if (!memcg->vmstats_local)
+		goto fail;
+
 	memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
 	if (!memcg->vmstats_percpu)
 		goto fail;
From: Yafang Shao laoar.shao@gmail.com
mainline inclusion from mainline-v5.3-rc1 commit dd9239900e12db84c198855b262ae7796db1123b category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
When we calculate total statistics for memcg1_stats and memcg1_events, we use the index 'i' in the for loop as the events index. Actually we should use memcg1_stats[i] and memcg1_events[i] as the events index.
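A tiny standalone illustration of this bug class (the item IDs and values below are made up, not the kernel's enums):

#include <stdio.h>

/* Item IDs are sparse in the full enum; the presentation array maps
 * display position -> item ID. */
enum { NR_FILE = 3, NR_ANON = 7, SWAP = 11 };
static const int memcg1_stats[] = { NR_FILE, NR_ANON, SWAP };
static const long state[12] = { [NR_FILE] = 111, [NR_ANON] = 222, [SWAP] = 333 };

int main(void)
{
	for (unsigned i = 0; i < sizeof(memcg1_stats) / sizeof(*memcg1_stats); i++)
		printf("buggy %ld vs fixed %ld\n",
		       state[i],                 /* loop index: wrong item */
		       state[memcg1_stats[i]]);  /* mapped ID: right item */
	return 0;
}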
Link: http://lkml.kernel.org/r/1562116978-19539-1-git-send-email-laoar.shao@gmail.... Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty") Signed-off-by: Yafang Shao laoar.shao@gmail.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Michal Hocko mhocko@suse.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Yafang Shao shaoyafang@didiglobal.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry picked from commit e97c8caa68a68615f64d3a94eb3ec225a90e56b9) Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 518ba12ae4d53..845757116e359 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3812,12 +3812,13 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
-			   (u64)memcg_page_state(memcg, i) * PAGE_SIZE);
+			   (u64)memcg_page_state(memcg, memcg1_stats[i]) *
+			   PAGE_SIZE);
 	}

 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
 		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
-			   (u64)memcg_events(memcg, i));
+			   (u64)memcg_events(memcg, memcg1_events[i]));

 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
From: Roman Gushchin guro@fb.com
mainline inclusion from mainline-v5.3-rc6 commit c350a99ea2b1b666c28948d74ab46c16913c28a7 category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
Percpu caching of local vmstats with the conditional propagation by the cgroup tree leads to an accumulation of errors on non-leaf levels.
Let's imagine two nested memory cgroups A and A/B. Say, a process belonging to A/B allocates 100 pagecache pages on CPU 0. The percpu cache will spill 3 times, so that 32*3=96 pages will be accounted to the A/B and A atomic vmstat counters, and 4 pages will remain in the percpu cache.
Imagine A/B is close to its memory.max limit, so that every following allocation triggers a direct reclaim on the local CPU. Say, each such attempt will free 16 pages on a new cpu. That means every percpu cache will have -16 pages, except the first one, which will have 4 - 16 = -12. The A/B and A atomic counters will not be touched at all.
Now a user removes A/B. All percpu caches are freed and corresponding vmstat numbers are forgotten. A has 96 pages more than expected.
As memory cgroups are created and destroyed, errors do accumulate. Even 1-2 page differences can accumulate into large numbers.
To fix this issue let's accumulate and propagate percpu vmstat values before releasing the memory cgroup. At this point these numbers are stable and cannot be changed.
Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate only over online cpus.
Link: http://lkml.kernel.org/r/20190819202338.363363-2-guro@fb.com Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty") Signed-off-by: Roman Gushchin guro@fb.com Acked-by: Michal Hocko mhocko@suse.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry picked from commit c22087d02d52e521e348ec207b7434f3b10f0637) Conflicts: mm/memcontrol.c Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 845757116e359..1316263f7562b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3210,6 +3210,41 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 	}
 }

+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+{
+	unsigned long stat[MEMCG_NR_STAT];
+	struct mem_cgroup *mi;
+	int node, cpu, i;
+
+	for (i = 0; i < MEMCG_NR_STAT; i++)
+		stat[i] = 0;
+
+	for_each_online_cpu(cpu)
+		for (i = 0; i < MEMCG_NR_STAT; i++)
+			stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
+
+	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+		for (i = 0; i < MEMCG_NR_STAT; i++)
+			atomic_long_add(stat[i], &mi->vmstats[i]);
+
+	for_each_node(node) {
+		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+		struct mem_cgroup_per_node *pi;
+
+		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+			stat[i] = 0;
+
+		for_each_online_cpu(cpu)
+			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+				stat[i] += raw_cpu_read(
+					pn->lruvec_stat_cpu->count[i]);
+
+		for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+				atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+	}
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
@@ -4823,6 +4858,11 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	struct mem_cgroup_extension *memcg_ext;

+	/*
+	 * Flush percpu vmstats to guarantee the value correctness
+	 * on parent's and all ancestor levels.
+	 */
+	memcg_flush_percpu_vmstats(memcg);
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->vmstats_percpu);
From: Roman Gushchin guro@fb.com
mainline inclusion from mainline-v5.3-rc6 commit bb65f89b7d3d305c14951f49860711fbcae70692 category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
Similar to vmstats, percpu caching of local vmevents leads to an accumulation of errors on non-leaf levels. This happens because some leftovers may remain in percpu caches, so that they are never propagated up the cgroup tree and simply disappear when the memory cgroup is released.
To fix this issue let's accumulate and propagate percpu vmevents values before releasing the memory cgroup similar to what we're doing with vmstats.
Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate only over online cpus.
Link: http://lkml.kernel.org/r/20190819202338.363363-4-guro@fb.com Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty") Signed-off-by: Roman Gushchin guro@fb.com Acked-by: Michal Hocko mhocko@suse.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry-pick commit from 01d21e395a5b635a93820b4279d37f5cd6de6913) Conflicts: mm/memcontrol.c Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1316263f7562b..e618b1baab554 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3245,6 +3245,25 @@ static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
 	}
 }

+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+	unsigned long events[NR_VM_EVENT_ITEMS];
+	struct mem_cgroup *mi;
+	int cpu, i;
+
+	for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+		events[i] = 0;
+
+	for_each_online_cpu(cpu)
+		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+			events[i] += raw_cpu_read(
+				memcg->vmstats_percpu->events[i]);
+
+	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+			atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
@@ -4859,10 +4878,11 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	struct mem_cgroup_extension *memcg_ext;

 	/*
-	 * Flush percpu vmstats to guarantee the value correctness
+	 * Flush percpu vmstats and vmevents to guarantee the value correctness
 	 * on parent's and all ancestor levels.
 	 */
 	memcg_flush_percpu_vmstats(memcg);
+	memcg_flush_percpu_vmevents(memcg);
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->vmstats_percpu);
From: Yafang Shao laoar.shao@gmail.com
mainline inclusion from mainline-v5.3-rc1 commit 766a4c19d880887c457811b86f1f68525e416965 category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
After commit 815744d75152 ("mm: memcontrol: don't batch updates of local VM stats and events"), the local VM counters are not in sync with the hierarchical ones.
Below is one example in a leaf memcg on my server (with 8 CPUs):
inactive_file 3567570944
total_inactive_file 3568029696
We find that the deviation is very large because the 'val' in __mod_memcg_state() is in pages while the effective value in memcg_stat_show() is in bytes.
So the maximum of this deviation between local VM stats and total VM stats can be (32 * number_of_cpu * PAGE_SIZE), that may be an unacceptably great value.
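As a quick sanity check (assuming 4 KiB pages, which these numbers suggest): for the 8-CPU server above the bound works out to 32 pages * 8 CPUs * 4096 bytes = 1048576 bytes, and the observed gap of 3568029696 - 3567570944 = 458752 bytes (112 pages) indeed falls within it.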
We should keep the local VM stats in sync with the total stats. In order to keep this behavior the same across counters, this patch updates __mod_lruvec_state() and __count_memcg_events() as well.
Link: http://lkml.kernel.org/r/1562851979-10610-1-git-send-email-laoar.shao@gmail.... Signed-off-by: Yafang Shao laoar.shao@gmail.com Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@kernel.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Yafang Shao shaoyafang@didiglobal.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry-pick commit from 2f5bb3c19249c51a41395f9a4400ec05c002a60a) Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e618b1baab554..cd9e34fbcfc52 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -695,12 +695,15 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;

-	__this_cpu_add(memcg->vmstats_local->stat[idx], val);
-
 	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;

+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmstats[idx]);
 		x = 0;
@@ -749,13 +752,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	/* Update memcg */
 	__mod_memcg_state(memcg, idx, val);

-	/* Update lruvec */
-	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
 	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup_per_node *pi;

+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(pn->lruvec_stat_local->count[idx], x);
 		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 			atomic_long_add(x, &pi->lruvec_stat[idx]);
 		x = 0;
@@ -777,12 +782,15 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled())
 		return;

-	__this_cpu_add(memcg->vmstats_local->events[idx], count);
-
 	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;

+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->events[idx], x);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmevents[idx]);
 		x = 0;
From: Roman Gushchin guro@fb.com
mainline inclusion from mainline-v5.3-rc7 commit b4c46484dc3fa3721d68fdfae85c1d7b1f6b5472 category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
Commit 766a4c19d880 ("mm/memcontrol.c: keep local VM counters in sync with the hierarchical ones") effectively decreased the precision of per-memcg vmstats_local and per-memcg-per-node lruvec percpu counters.
That's good for displaying in memory.stat, but brings a serious regression into the reclaim process.
One issue I've discovered and debugged is the following: lruvec_lru_size() can return 0 instead of the actual number of pages in the lru list, preventing the kernel from reclaiming the last remaining pages. The result is yet another flood of dying memory cgroups. The opposite is also happening: scanning an empty lru list is a waste of cpu time.

Also, inactive_list_is_low() can return incorrect values, preventing the active lru from being scanned and freed. It can fail both because the sizes of the active and inactive lists are inaccurate, and because the number of workingset refaults isn't precise. In other words, the result is pretty random.

I'm not sure if using the approximate number of slab pages in count_shadow_number() is acceptable, but the issues described above are enough to partially revert the patch.

Let's keep per-memcg vmstat_local batched (they are only used for displaying stats to userspace), but keep lruvec stats precise. This change fixes the dead memcg flooding on my setup.
Link: http://lkml.kernel.org/r/20190817004726.2530670-1-guro@fb.com Fixes: 766a4c19d880 ("mm/memcontrol.c: keep local VM counters in sync with the hierarchical ones") Signed-off-by: Roman Gushchin guro@fb.com Acked-by: Yafang Shao laoar.shao@gmail.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry-pick commit from 81b6bde6ce186b25297ffc0138f9bc4523f3497c) Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cd9e34fbcfc52..d9f040773a473 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -752,15 +752,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	/* Update memcg */
 	__mod_memcg_state(memcg, idx, val);

+	/* Update lruvec */
+	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
 	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup_per_node *pi;

-		/*
-		 * Batch local counters to keep them in sync with
-		 * the hierarchical ones.
-		 */
-		__this_cpu_add(pn->lruvec_stat_local->count[idx], x);
 		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 			atomic_long_add(x, &pi->lruvec_stat[idx]);
 		x = 0;
From: Shakeel Butt shakeelb@google.com
mainline inclusion from mainline-v5.3-rc7 commit 6c1c280805ded72eceb2afc1a0d431b256608554 category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
Instead of using raw_cpu_read(), use per_cpu() to read the actual data of the corresponding cpu; otherwise we will be reading the data of the current cpu once for each of the online CPUs.
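A userspace analogue of the difference (plain arrays standing in for percpu storage; all names are illustrative):

#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	long stat[NR_CPUS] = { 10, 20, 30, 40 };
	int this_cpu = 0;     /* the cpu actually running the flush */
	long buggy = 0, fixed = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		buggy += stat[this_cpu]; /* like raw_cpu_read(): always this cpu */
		fixed += stat[cpu];      /* like per_cpu(..., cpu): the iterated cpu */
	}
	printf("buggy %ld, fixed %ld\n", buggy, fixed); /* 40 vs 100 */
	return 0;
}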
Link: http://lkml.kernel.org/r/20190829203110.129263-1-shakeelb@google.com Fixes: bb65f89b7d3d ("mm: memcontrol: flush percpu vmevents before releasing memcg") Fixes: c350a99ea2b1 ("mm: memcontrol: flush percpu vmstats before releasing memcg") Signed-off-by: Shakeel Butt shakeelb@google.com Acked-by: Roman Gushchin guro@fb.com Acked-by: Michal Hocko mhocko@suse.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry pick commit from ff99cde5226f5ab3e032b870bec6b1631574bd0b) Conflicts: mm/memcontrol.c Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d9f040773a473..33a1141be5e86 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3227,7 +3227,7 @@ static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)

 	for_each_online_cpu(cpu)
 		for (i = 0; i < MEMCG_NR_STAT; i++)
-			stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
+			stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);

 	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 		for (i = 0; i < MEMCG_NR_STAT; i++)
@@ -3242,8 +3242,8 @@ static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)

 		for_each_online_cpu(cpu)
 			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-				stat[i] += raw_cpu_read(
-					pn->lruvec_stat_cpu->count[i]);
+				stat[i] += per_cpu(
+					pn->lruvec_stat_cpu->count[i], cpu);

 		for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
 			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
@@ -3262,8 +3262,8 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)

 	for_each_online_cpu(cpu)
 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-			events[i] += raw_cpu_read(
-				memcg->vmstats_percpu->events[i]);
+			events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+					     cpu);

 	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
From: Honglei Wang honglei.wang@oracle.com
mainline inclusion from mainline-v5.4-rc4 commit b11edebbc967ebf5c55b8f9e1d5bb6d68ec3a7fd category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
Commit 1a61ab8038e72 ("mm: memcontrol: replace zone summing with lruvec_page_state()") made lruvec_page_state() use per-cpu counters instead of calculating the value directly from lru_zone_size, with the idea that this would be more effective.

Tim has reported that this is not really the case for their database benchmark, which shows the opposite result: lruvec_page_state takes up a huge chunk of CPU cycles (about 25% of the system time, which is roughly 7% of total cpu cycles) on 5.3 kernels. The workload is running on a larger machine (96 CPUs), it has many cgroups (500) and it is heavily direct-reclaim bound.
Tim Chen said:
: The problem can also be reproduced by running simple multi-threaded
: pmbench benchmark with a fast Optane SSD swap (see profile below).
:
:
:   6.15%     3.08%  pmbench          [kernel.vmlinux]  [k] lruvec_lru_size
:            |
:            |--3.07%--lruvec_lru_size
:            |          |
:            |          |--2.11%--cpumask_next
:            |          |          |
:            |          |           --1.66%--find_next_bit
:            |          |
:            |           --0.57%--call_function_interrupt
:            |                     |
:            |                      --0.55%--smp_call_function_interrupt
:            |
:            |--1.59%--0x441f0fc3d009
:            |          _ops_rdtsc_init_base_freq
:            |          access_histogram
:            |          page_fault
:            |          __do_page_fault
:            |          handle_mm_fault
:            |          __handle_mm_fault
:            |          |
:            |           --1.54%--do_swap_page
:            |                     swapin_readahead
:            |                     swap_cluster_readahead
:            |                     |
:            |                      --1.53%--read_swap_cache_async
:            |                                __read_swap_cache_async
:            |                                alloc_pages_vma
:            |                                __alloc_pages_nodemask
:            |                                __alloc_pages_slowpath
:            |                                try_to_free_pages
:            |                                do_try_to_free_pages
:            |                                shrink_node
:            |                                shrink_node_memcg
:            |                                |
:            |                                |--0.77%--lruvec_lru_size
:            |                                |
:            |                                 --0.76%--inactive_list_is_low
:            |                                           |
:            |                                            --0.76%--lruvec_lru_size
:            |
:             --1.50%--measure_read
:                       page_fault
:                       __do_page_fault
:                       handle_mm_fault
:                       __handle_mm_fault
:                       do_swap_page
:                       swapin_readahead
:                       swap_cluster_readahead
:                       |
:                        --1.48%--read_swap_cache_async
:                                  __read_swap_cache_async
:                                  alloc_pages_vma
:                                  __alloc_pages_nodemask
:                                  __alloc_pages_slowpath
:                                  try_to_free_pages
:                                  do_try_to_free_pages
:                                  shrink_node
:                                  shrink_node_memcg
:                                  |
:                                  |--0.75%--inactive_list_is_low
:                                  |          |
:                                  |           --0.75%--lruvec_lru_size
:                                  |
:                                   --0.73%--lruvec_lru_size
The likely culprit is the cache traffic the lruvec_page_state_local generates. Dave Hansen says:
: I was thinking purely of the cache footprint.  If it's reading
: pn->lruvec_stat_local->count[idx] is three separate cachelines, so 192
: bytes of cache *96 CPUs = 18k of data, mostly read-only.  1 cgroup would
: be 18k of data for the whole system and the caching would be pretty
: efficient and all 18k would probably survive a tight page fault loop in
: the L1.  500 cgroups would be ~90k of data per CPU thread which doesn't
: fit in the L1 and probably wouldn't survive a tight page fault loop if
: both logical threads were banging on different cgroups.
:
: It's just a theory, but it's why I noted the number of cgroups when I
: initially saw this show up in profiles
Fix the regression by partially reverting the said commit and calculating the lru size explicitly.
Link: http://lkml.kernel.org/r/20190905071034.16822-1-honglei.wang@oracle.com Fixes: 1a61ab8038e72 ("mm: memcontrol: replace zone summing with lruvec_page_state()") Signed-off-by: Honglei Wang honglei.wang@oracle.com Reported-by: Tim Chen tim.c.chen@linux.intel.com Acked-by: Tim Chen tim.c.chen@linux.intel.com Tested-by: Tim Chen tim.c.chen@linux.intel.com Acked-by: Michal Hocko mhocko@suse.com Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Roman Gushchin guro@fb.com Cc: Tejun Heo tj@kernel.org Cc: Dave Hansen dave.hansen@intel.com Cc: stable@vger.kernel.org [5.2+] Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry pick commit from 649cad91d7ad38cbd918a74f8128e7190b1cffb5) Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmscan.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f3187be99d3d6..de03e899324e0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -352,12 +352,13 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
  */
 unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
 {
-	unsigned long lru_size;
+	unsigned long lru_size = 0;
 	int zid;

-	if (!mem_cgroup_disabled())
-		lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
-	else
+	if (!mem_cgroup_disabled()) {
+		for (zid = 0; zid < MAX_NR_ZONES; zid++)
+			lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+	} else
 		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
From: Shakeel Butt shakeelb@google.com
mainline inclusion from mainline-v5.4-rc7 commit 7961eee3978475fd9e8626137f88595b1ca05856 category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
-------------------------------------------------
__mem_cgroup_free() can be called on the failure path in mem_cgroup_alloc(). However memcg_flush_percpu_vmstats() and memcg_flush_percpu_vmevents(), which are called from __mem_cgroup_free(), access fields of the memcg that can potentially be NULL when called from the failure path of mem_cgroup_alloc(). Indeed syzbot has reported the following crash:
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 30393 Comm: syz-executor.1 Not tainted 5.4.0-rc2+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:memcg_flush_percpu_vmstats+0x4ae/0x930 mm/memcontrol.c:3436
Code: 05 41 89 c0 41 0f b6 04 24 41 38 c7 7c 08 84 c0 0f 85 5d 03 00 00 44 3b 05 33 d5 12 08 0f 83 e2 00 00 00 4c 89 f0 48 c1 e8 03 <42> 80 3c 28 00 0f 85 91 03 00 00 48 8b 85 10 fe ff ff 48 8b b0 90
RSP: 0018:ffff888095c27980 EFLAGS: 00010206
RAX: 0000000000000012 RBX: ffff888095c27b28 RCX: ffffc90008192000
RDX: 0000000000040000 RSI: ffffffff8340fae7 RDI: 0000000000000007
RBP: ffff888095c27be0 R08: 0000000000000000 R09: ffffed1013f0da33
R10: ffffed1013f0da32 R11: ffff88809f86d197 R12: fffffbfff138b760
R13: dffffc0000000000 R14: 0000000000000090 R15: 0000000000000007
FS:  00007f5027170700(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000710158 CR3: 00000000a7b18000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 __mem_cgroup_free+0x1a/0x190 mm/memcontrol.c:5021
 mem_cgroup_free mm/memcontrol.c:5033 [inline]
 mem_cgroup_css_alloc+0x3a1/0x1ae0 mm/memcontrol.c:5160
 css_create kernel/cgroup/cgroup.c:5156 [inline]
 cgroup_apply_control_enable+0x44d/0xc40 kernel/cgroup/cgroup.c:3119
 cgroup_mkdir+0x899/0x11b0 kernel/cgroup/cgroup.c:5401
 kernfs_iop_mkdir+0x14d/0x1d0 fs/kernfs/dir.c:1124
 vfs_mkdir+0x42e/0x670 fs/namei.c:3807
 do_mkdirat+0x234/0x2a0 fs/namei.c:3830
 __do_sys_mkdir fs/namei.c:3846 [inline]
 __se_sys_mkdir fs/namei.c:3844 [inline]
 __x64_sys_mkdir+0x5c/0x80 fs/namei.c:3844
 do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
Fix this by moving the flush to mem_cgroup_free(), as there is no need to flush anything if we see a failure in mem_cgroup_alloc().
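A stripped-down userspace model of the ordering problem (hypothetical names; malloc stands in for alloc_percpu):

#include <stdio.h>
#include <stdlib.h>

struct memcg { long *vmstats_percpu; };

static void flush(struct memcg *m)
{
	/* dereferences the percpu pointer: a NULL here is the syzbot GPF */
	printf("flushed %ld\n", m->vmstats_percpu[0]);
}

/* common teardown, reachable from the allocation failure path */
static void __memcg_free(struct memcg *m)
{
	/* before the fix, flush(m) lived here and ran even when
	 * vmstats_percpu was never allocated */
	free(m->vmstats_percpu);
	free(m);
}

/* teardown of a fully constructed memcg only */
static void memcg_free(struct memcg *m)
{
	flush(m);          /* after the fix, only this path flushes */
	__memcg_free(m);
}

int main(void)
{
	struct memcg *m = calloc(1, sizeof(*m));
	m->vmstats_percpu = malloc(sizeof(long)); /* pretend this can fail */
	if (!m->vmstats_percpu) {
		__memcg_free(m);  /* failure path: no flush, no NULL deref */
		return 1;
	}
	m->vmstats_percpu[0] = 96;
	memcg_free(m);
	return 0;
}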
Link: http://lkml.kernel.org/r/20191018165231.249872-1-shakeelb@google.com Fixes: bb65f89b7d3d ("mm: memcontrol: flush percpu vmevents before releasing memcg") Fixes: c350a99ea2b1 ("mm: memcontrol: flush percpu vmstats before releasing memcg") Signed-off-by: Shakeel Butt shakeelb@google.com Reported-by: syzbot+515d5bcfe179cdf049b2@syzkaller.appspotmail.com Reviewed-by: Roman Gushchin guro@fb.com Cc: Michal Hocko mhocko@suse.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com (cherry pick commit from b04fca9a1b9e2668a999dac3897c15fb8f4bb093) Conflicts: mm/memcontrol.c Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 33a1141be5e86..37b4fa1b4ea36 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4883,12 +4883,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	struct mem_cgroup_extension *memcg_ext;

-	/*
-	 * Flush percpu vmstats and vmevents to guarantee the value correctness
-	 * on parent's and all ancestor levels.
-	 */
-	memcg_flush_percpu_vmstats(memcg);
-	memcg_flush_percpu_vmevents(memcg);
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->vmstats_percpu);
@@ -4901,6 +4895,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	memcg_wb_domain_exit(memcg);
+	/*
+	 * Flush percpu vmstats and vmevents to guarantee the value correctness
+	 * on parent's and all ancestor levels.
+	 */
+	memcg_flush_percpu_vmstats(memcg);
+	memcg_flush_percpu_vmevents(memcg);
 	__mem_cgroup_free(memcg);
 }
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: bugfix bugzilla: 51815, https://gitee.com/openeuler/kernel/issues/I3IJ9I CVE: NA
--------
1) Rename struct memcg_vmstats_percpu to struct mem_cgroup_stat_cpu
2) Move lruvec_stat_local from struct mem_cgroup_per_node to struct mem_cgroup_per_node_extension
3) Rename vmstats and vmevents to stat and events in struct mem_cgroup
4) Rename vmstats_percpu to stat_cpu in struct mem_cgroup
5) Move vmstats_local from struct mem_cgroup to struct mem_cgroup_extension
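The extension structs rely on container_of-style pointer arithmetic to recover the wrapper from a base pointer; a minimal userspace model of the pattern, as shown below (the struct and macro names mirror the patch, everything else is illustrative):

#include <stddef.h>
#include <stdio.h>

struct mem_cgroup_per_node { int nid; };

/* KABI-friendly: new fields live in a wrapper around the unchanged base. */
struct mem_cgroup_per_node_extension {
	struct mem_cgroup_per_node pn;
	long *lruvec_stat_local;
};

#define to_mgpn_ext(ptr) \
	((struct mem_cgroup_per_node_extension *) \
	 ((char *)(ptr) - offsetof(struct mem_cgroup_per_node_extension, pn)))

int main(void)
{
	struct mem_cgroup_per_node_extension ext = { .pn = { .nid = 1 } };
	struct mem_cgroup_per_node *pn = &ext.pn;

	/* Given only the base pointer, recover the extension. */
	printf("nid via extension: %d\n", to_mgpn_ext(pn)->pn.nid);
	return 0;
}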
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/memcontrol.h | 36 ++++++++++------ mm/memcontrol.c | 88 ++++++++++++++++++++++---------------- 2 files changed, 72 insertions(+), 52 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5425128cb2b93..c7c7c0a418771 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -94,8 +94,8 @@ enum mem_cgroup_events_target {
 	MEM_CGROUP_NTARGETS,
 };

-struct memcg_vmstats_percpu {
-	long stat[MEMCG_NR_STAT];
+struct mem_cgroup_stat_cpu {
+	long count[MEMCG_NR_STAT];
 	unsigned long events[NR_VM_EVENT_ITEMS];
 	unsigned long nr_page_events;
 	unsigned long targets[MEM_CGROUP_NTARGETS];
@@ -126,9 +126,6 @@ struct memcg_shrinker_map {
 struct mem_cgroup_per_node {
 	struct lruvec lruvec;

-	/* Legacy local VM stats */
-	struct lruvec_stat __percpu *lruvec_stat_local;
-
 	/* Subtree VM stats (batched updates) */
 	struct lruvec_stat __percpu *lruvec_stat_cpu;
 	atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
@@ -150,6 +147,14 @@ struct mem_cgroup_per_node {
 	/* use container_of */
 };

+struct mem_cgroup_per_node_extension {
+	struct mem_cgroup_per_node pn;
+	/* Legacy local VM stats */
+	struct lruvec_stat __percpu *lruvec_stat_local;
+};
+
+#define to_mgpn_ext(pn) container_of(pn, struct mem_cgroup_per_node_extension, pn)
+
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	unsigned long threshold;
@@ -276,16 +281,13 @@ struct mem_cgroup {
 	atomic_t moving_account;
 	struct task_struct *move_lock_task;

-	/* Legacy local VM stats and events */
-	struct memcg_vmstats_percpu __percpu *vmstats_local;
-
 	/* Subtree VM stats and events (batched updates) */
-	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
+	struct mem_cgroup_stat_cpu __percpu *stat_cpu;

 	MEMCG_PADDING(_pad2_);

-	atomic_long_t vmstats[MEMCG_NR_STAT];
-	atomic_long_t vmevents[NR_VM_EVENT_ITEMS];
+	atomic_long_t stat[MEMCG_NR_STAT];
+	atomic_long_t events[NR_VM_EVENT_ITEMS];

 	/* memory.events */
 	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
@@ -330,6 +332,8 @@ struct mem_cgroup_extension {
 	 */
 	int memcg_priority;
 #endif
+	/* Legacy local VM stats and events */
+	struct mem_cgroup_stat_cpu __percpu *vmstats_local;
 	spinlock_t split_queue_lock;
 	struct list_head split_queue;
 	unsigned long split_queue_len;
@@ -588,7 +592,7 @@ void unlock_page_memcg(struct page *page);
  */
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 {
-	long x = atomic_long_read(&memcg->vmstats[idx]);
+	long x = atomic_long_read(&memcg->stat[idx]);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -605,9 +609,11 @@ static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
 {
 	long x = 0;
 	int cpu;
+	struct mem_cgroup_extension *mgext;

+	mgext = to_memcg_ext(memcg);
 	for_each_possible_cpu(cpu)
-		x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
+		x += per_cpu(mgext->vmstats_local->count[idx], cpu);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -681,6 +687,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
					 enum node_stat_item idx)
 {
 	struct mem_cgroup_per_node *pn;
+	struct mem_cgroup_per_node_extension *pnext;
 	long x = 0;
 	int cpu;

@@ -688,8 +695,9 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 		return node_page_state(lruvec_pgdat(lruvec), idx);

 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	pnext = to_mgpn_ext(pn);
 	for_each_possible_cpu(cpu)
-		x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
+		x += per_cpu(pnext->lruvec_stat_local->count[idx], cpu);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 37b4fa1b4ea36..da10300a6e7d7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -695,20 +695,22 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;
-	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+	x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;
+		struct mem_cgroup_extension *mgext;

 		/*
 		 * Batch local counters to keep them in sync with
 		 * the hierarchical ones.
 		 */
-		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
+		mgext = to_memcg_ext(memcg);
+		__this_cpu_add(mgext->vmstats_local->count[idx], x);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-			atomic_long_add(x, &mi->vmstats[idx]);
+			atomic_long_add(x, &mi->stat[idx]);
 		x = 0;
 	}
-	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+	__this_cpu_write(memcg->stat_cpu->count[idx], x);
 }

 static struct mem_cgroup_per_node *
@@ -737,6 +739,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 {
 	pg_data_t *pgdat = lruvec_pgdat(lruvec);
 	struct mem_cgroup_per_node *pn;
+	struct mem_cgroup_per_node_extension *pnext;
 	struct mem_cgroup *memcg;
 	long x;

@@ -753,7 +756,8 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	__mod_memcg_state(memcg, idx, val);

 	/* Update lruvec */
-	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+	pnext = to_mgpn_ext(pn);
+	__this_cpu_add(pnext->lruvec_stat_local->count[idx], val);

 	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
@@ -780,34 +784,38 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled())
 		return;

-	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+	x = count + __this_cpu_read(memcg->stat_cpu->events[idx]);
 	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;
+		struct mem_cgroup_extension *mgext;

 		/*
 		 * Batch local counters to keep them in sync with
 		 * the hierarchical ones.
 		 */
-		__this_cpu_add(memcg->vmstats_local->events[idx], x);
+		mgext = to_memcg_ext(memcg);
+		__this_cpu_add(mgext->vmstats_local->events[idx], x);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-			atomic_long_add(x, &mi->vmevents[idx]);
+			atomic_long_add(x, &mi->events[idx]);
 		x = 0;
 	}
-	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+	__this_cpu_write(memcg->stat_cpu->events[idx], x);
 }

 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 {
-	return atomic_long_read(&memcg->vmevents[event]);
+	return atomic_long_read(&memcg->events[event]);
 }

 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 {
 	long x = 0;
 	int cpu;
+	struct mem_cgroup_extension *mgext;

+	mgext = to_memcg_ext(memcg);
 	for_each_possible_cpu(cpu)
-		x += per_cpu(memcg->vmstats_local->events[event], cpu);
+		x += per_cpu(mgext->vmstats_local->events[event], cpu);
 	return x;
 }

@@ -840,7 +848,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 		nr_pages = -nr_pages; /* for event */
 	}

-	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
+	__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
 }

 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
@@ -848,8 +856,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
 	unsigned long val, next;

-	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
-	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
+	val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
+	next = __this_cpu_read(memcg->stat_cpu->targets[target]);
 	/* from time_after() in jiffies.h */
 	if ((long)(next - val) < 0) {
 		switch (target) {
@@ -865,7 +873,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 		default:
 			break;
 		}
-		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
+		__this_cpu_write(memcg->stat_cpu->targets[target], next);
 		return true;
 	}
 	return false;
@@ -2238,10 +2246,10 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 			int nid;
 			long x;

-			x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
+			x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
 			if (x)
 				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-					atomic_long_add(x, &memcg->vmstats[i]);
+					atomic_long_add(x, &memcg->stat[i]);

 			if (i >= NR_VM_NODE_STAT_ITEMS)
 				continue;
@@ -2261,10 +2269,10 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
 			long x;

-			x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
+			x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
 			if (x)
 				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-					atomic_long_add(x, &memcg->vmevents[i]);
+					atomic_long_add(x, &memcg->events[i]);
 		}
 	}
@@ -3227,11 +3235,11 @@ static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)

 	for_each_online_cpu(cpu)
 		for (i = 0; i < MEMCG_NR_STAT; i++)
-			stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+			stat[i] += per_cpu(memcg->stat_cpu->count[i], cpu);

 	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 		for (i = 0; i < MEMCG_NR_STAT; i++)
-			atomic_long_add(stat[i], &mi->vmstats[i]);
+			atomic_long_add(stat[i], &mi->stat[i]);

 	for_each_node(node) {
 		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
@@ -3262,12 +3270,12 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)

 	for_each_online_cpu(cpu)
 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-			events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+			events[i] += per_cpu(memcg->stat_cpu->events[i],
 					     cpu);

 	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-			atomic_long_add(events[i], &mi->vmevents[i]);
+			atomic_long_add(events[i], &mi->events[i]);
 }

 #ifdef CONFIG_MEMCG_KMEM
@@ -4323,11 +4331,11 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
  */
 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
 {
-	long x = atomic_long_read(&memcg->vmstats[idx]);
+	long x = atomic_long_read(&memcg->stat[idx]);
 	int cpu;

 	for_each_online_cpu(cpu)
-		x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
+		x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
 	if (x < 0)
 		x = 0;
 	return x;
@@ -4829,6 +4837,7 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn;
+	struct mem_cgroup_per_node_extension *pnext;
 	int tmp = node;
 	/*
 	 * This routine is called against possible nodes.
@@ -4844,15 +4853,16 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;

-	pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
-	if (!pn->lruvec_stat_local) {
-		kfree(pn);
+	pnext = to_mgpn_ext(pn);
+	pnext->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+	if (!pnext->lruvec_stat_local) {
+		kfree(pnext);
 		return 1;
 	}

 	pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
 	if (!pn->lruvec_stat_cpu) {
-		free_percpu(pn->lruvec_stat_local);
+		free_percpu(pnext->lruvec_stat_local);
 		kfree(pn);
 		return 1;
 	}
@@ -4869,12 +4879,14 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+	struct mem_cgroup_per_node_extension *pnext;

 	if (!pn)
 		return;

 	free_percpu(pn->lruvec_stat_cpu);
-	free_percpu(pn->lruvec_stat_local);
+	pnext = to_mgpn_ext(pn);
+	free_percpu(pnext->lruvec_stat_local);
 	kfree(pn);
 }

@@ -4883,12 +4895,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	struct mem_cgroup_extension *memcg_ext;

+	memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg);
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
-	free_percpu(memcg->vmstats_percpu);
-	free_percpu(memcg->vmstats_local);
+	free_percpu(memcg->stat_cpu);
+	free_percpu(memcg_ext->vmstats_local);

-	memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg);
 	kfree(memcg_ext);
 }

@@ -4925,12 +4937,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (memcg->id.id < 0)
 		goto fail;

-	memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
-	if (!memcg->vmstats_local)
+	memcg_ext->vmstats_local = alloc_percpu(struct mem_cgroup_stat_cpu);
+	if (!memcg_ext->vmstats_local)
 		goto fail;

-	memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
-	if (!memcg->vmstats_percpu)
+	memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
+	if (!memcg->stat_cpu)
 		goto fail;

 	for_each_node(node)
@@ -6576,7 +6588,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 	__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
 	__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
 	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
+	__this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
 	memcg_check_events(ug->memcg, ug->dummy_page);
 	local_irq_restore(flags);