From: Roman Gushchin guro@fb.com
mainline inclusion from mainline-5.3-rc1 commit fb2f2b0adb98bbbbbb51c5a5327f3f90f5dc417e category: bugfix bugzilla: 34611 CVE: NA
------------------------------------------------- Let's reparent non-root kmem_caches on memcg offlining. This allows us to release the memory cgroup without waiting for the last outstanding kernel object (e.g. dentry used by another application).
Since the parent cgroup is already charged, everything we need to do is to splice the list of kmem_caches to the parent's kmem_caches list, swap the memcg pointer, drop the css refcounter for each kmem_cache and adjust the parent's css refcounter.
Please, note that kmem_cache->memcg_params.memcg isn't a stable pointer anymore. It's safe to read it under rcu_read_lock(), cgroup_mutex held, or any other way that protects the memory cgroup from being released.
We can race with the slab allocation and deallocation paths. It's not a big problem: parent's charge and slab global stats are always correct, and we don't care anymore about the child usage and global stats. The child cgroup is already offline, so we don't use or show it anywhere.
Local slab stats (NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE) aren't used anywhere except count_shadow_nodes(). But even there it won't break anything: after reparenting "nodes" will be 0 on child level (because we're already reparenting shrinker lists), and on parent level page stats always were 0, and this patch won't change anything.
[guro@fb.com: properly handle kmem_caches reparented to root_mem_cgroup] Link: http://lkml.kernel.org/r/20190620213427.1691847-1-guro@fb.com Link: http://lkml.kernel.org/r/20190611231813.3148843-11-guro@fb.com Signed-off-by: Roman Gushchin guro@fb.com Acked-by: Vladimir Davydov vdavydov.dev@gmail.com Reviewed-by: Shakeel Butt shakeelb@google.com Acked-by: David Rientjes rientjes@google.com Cc: Christoph Lameter cl@linux.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Cc: Waiman Long longman@redhat.com Cc: David Rientjes rientjes@google.com Cc: Joonsoo Kim iamjoonsoo.kim@lge.com Cc: Pekka Enberg penberg@kernel.org Cc: Andrei Vagin avagin@gmail.com Cc: Qian Cai cai@lca.pw Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit fb2f2b0adb98bbbbbb51c5a5327f3f90f5dc417e) Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/slab.h | 2 +- mm/memcontrol.c | 14 ++++++++------ mm/slab.h | 41 ++++++++++++++++++++++++++++++++--------- mm/slab_common.c | 19 +++++++++++++++++-- 4 files changed, 58 insertions(+), 18 deletions(-)
diff --git a/include/linux/slab.h b/include/linux/slab.h index b3812235d59d..d79bd66b4aaa 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -152,7 +152,7 @@ void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *);
void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); -void memcg_deactivate_kmem_caches(struct mem_cgroup *); +void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *);
/* * Please use this macro to create slab caches. Simply specify the diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2a1e51175e91..ab33121bf706 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3228,15 +3228,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) */ memcg->kmem_state = KMEM_ALLOCATED;
- memcg_deactivate_kmem_caches(memcg); - - kmemcg_id = memcg->kmemcg_id; - BUG_ON(kmemcg_id < 0); - parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup;
+ memcg_deactivate_kmem_caches(memcg, parent); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + /* * Change kmemcg_id of this cgroup and all its descendants to the * parent's id, and then move all entries from this cgroup's list_lrus @@ -3269,7 +3269,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) if (memcg->kmem_state == KMEM_ALLOCATED) { WARN_ON(!list_empty(&memcg->kmem_caches)); static_branch_dec(&memcg_kmem_enabled_key); - WARN_ON(page_counter_read(&memcg->kmem)); } } #else @@ -4664,6 +4663,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* The following stuff does not apply to the root */ if (!parent) { +#ifdef CONFIG_MEMCG_KMEM + INIT_LIST_HEAD(&memcg->kmem_caches); +#endif root_mem_cgroup = memcg; return &memcg->css; } diff --git a/mm/slab.h b/mm/slab.h index b26e30b88f08..77fd31ea6877 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -261,6 +261,9 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) * which do not have slab_cache pointer set. * So this function assumes that the page can pass PageHead() and PageSlab() * checks. + * + * The kmem_cache can be reparented asynchronously. The caller must ensure + * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex. */ static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) { @@ -268,7 +271,7 @@ static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
s = READ_ONCE(page->slab_cache); if (s && !is_root_cache(s)) - return s->memcg_params.memcg; + return READ_ONCE(s->memcg_params.memcg);
return NULL; } @@ -285,10 +288,22 @@ static __always_inline int memcg_charge_slab(struct page *page, struct lruvec *lruvec; int ret;
- memcg = s->memcg_params.memcg; + rcu_read_lock(); + memcg = READ_ONCE(s->memcg_params.memcg); + while (memcg && !css_tryget_online(&memcg->css)) + memcg = parent_mem_cgroup(memcg); + rcu_read_unlock(); + + if (unlikely(!memcg || mem_cgroup_is_root(memcg))) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + (1 << order)); + percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + return 0; + } + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (ret) - return ret; + goto out;
lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); @@ -296,8 +311,9 @@ static __always_inline int memcg_charge_slab(struct page *page, /* transer try_charge() page references to kmem_cache */ percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); css_put_many(&memcg->css, 1 << order); - - return 0; +out: + css_put(&memcg->css); + return ret; }
/* @@ -310,10 +326,17 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct mem_cgroup *memcg; struct lruvec *lruvec;
- memcg = s->memcg_params.memcg; - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); - mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); - memcg_kmem_uncharge_memcg(page, order, memcg); + rcu_read_lock(); + memcg = READ_ONCE(s->memcg_params.memcg); + if (likely(!mem_cgroup_is_root(memcg))) { + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); + memcg_kmem_uncharge_memcg(page, order, memcg); + } else { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + -(1 << order)); + } + rcu_read_unlock();
percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 0867f6656858..a7d804122140 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -252,7 +252,8 @@ static void memcg_unlink_cache(struct kmem_cache *s) } else { list_del(&s->memcg_params.children_node); list_del(&s->memcg_params.kmem_caches_node); - css_put(&s->memcg_params.memcg->css); + mem_cgroup_put(s->memcg_params.memcg); + WRITE_ONCE(s->memcg_params.memcg, NULL); } } #else @@ -766,11 +767,13 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s) spin_unlock_irq(&memcg_kmem_wq_lock); }
-void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) +void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg, + struct mem_cgroup *parent) { int idx; struct memcg_cache_array *arr; struct kmem_cache *s, *c; + unsigned int nr_reparented;
idx = memcg_cache_id(memcg);
@@ -788,6 +791,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) kmemcg_cache_deactivate(c); arr->entries[idx] = NULL; } + nr_reparented = 0; + list_for_each_entry(s, &memcg->kmem_caches, + memcg_params.kmem_caches_node) { + WRITE_ONCE(s->memcg_params.memcg, parent); + css_put(&memcg->css); + nr_reparented++; + } + if (nr_reparented) { + list_splice_init(&memcg->kmem_caches, + &parent->kmem_caches); + css_get_many(&parent->css, nr_reparented); + } mutex_unlock(&slab_mutex);
put_online_mems();