 
            From: Yu Zhao <yuzhao@google.com> stable inclusion from stable-v6.6.8 commit 6b131c2a2875682bc5ea0d4b15ba76775aa3ea80 bugzilla: https://gitee.com/openeuler/kernel/issues/I99K53 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=... -------------------------------- commit 8aa420617918d12d1f5d55030a503c9418e73c2c upstream. While investigating kswapd "consuming 100% CPU" [1] (also see "mm/mglru: try to stop at high watermarks"), it was discovered that the memcg LRU can breach the thrashing protection imposed by min_ttl_ms. Before the memcg LRU: kswapd() shrink_node_memcgs() mem_cgroup_iter() inc_max_seq() // always hit a different memcg lru_gen_age_node() mem_cgroup_iter() check the timestamp of the oldest generation After the memcg LRU: kswapd() shrink_many() restart: iterate the memcg LRU: inc_max_seq() // occasionally hit the same memcg if raced with lru_gen_rotate_memcg(): goto restart lru_gen_age_node() mem_cgroup_iter() check the timestamp of the oldest generation Specifically, when the restart happens in shrink_many(), it needs to stick with the (memcg LRU) generation it began with. In other words, it should neither re-read memcg_lru->seq nor age an lruvec of a different generation. Otherwise it can hit the same memcg multiple times without giving lru_gen_age_node() a chance to check the timestamp of that memcg's oldest generation (against min_ttl_ms). [1] https://lore.kernel.org/CAK8fFZ4DY+GtBA40Pm7Nn5xCHy+51w3sfxPqkqpqakSXYyX+Wg@... Link: https://lkml.kernel.org/r/20231208061407.2125867-3-yuzhao@google.com Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists") Signed-off-by: Yu Zhao <yuzhao@google.com> Tested-by: T.J. Mercier <tjmercier@google.com> Cc: Charan Teja Kalla <quic_charante@quicinc.com> Cc: Hillf Danton <hdanton@sina.com> Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com> Cc: Kairui Song <ryncsn@gmail.com> Cc: Kalesh Singh <kaleshsingh@google.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: ZhangPeng <zhangpeng362@huawei.com> --- include/linux/mmzone.h | 30 +++++++++++++++++------------- mm/vmscan.c | 30 ++++++++++++++++-------------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 655157b32fa6..e9dbaae315bf 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -514,33 +514,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); * the old generation, is incremented when all its bins become empty. * * There are four operations: - * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its + * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its * current generation (old or young) and updates its "seg" to "head"; - * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its + * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its * current generation (old or young) and updates its "seg" to "tail"; - * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old + * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old * generation, updates its "gen" to "old" and resets its "seg" to "default"; - * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the + * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the * young generation, updates its "gen" to "young" and resets its "seg" to * "default". * * The events that trigger the above operations are: * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; - * 2. The first attempt to reclaim an memcg below low, which triggers + * 2. The first attempt to reclaim a memcg below low, which triggers * MEMCG_LRU_TAIL; - * 3. The first attempt to reclaim an memcg below reclaimable size threshold, + * 3. The first attempt to reclaim a memcg below reclaimable size threshold, * which triggers MEMCG_LRU_TAIL; - * 4. The second attempt to reclaim an memcg below reclaimable size threshold, + * 4. The second attempt to reclaim a memcg below reclaimable size threshold, * which triggers MEMCG_LRU_YOUNG; - * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; + * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; - * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. + * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD. * - * Note that memcg LRU only applies to global reclaim, and the round-robin - * incrementing of their max_seq counters ensures the eventual fairness to all - * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). + * Notes: + * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing + * of their max_seq counters ensures the eventual fairness to all eligible + * memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). + * 2. There are only two valid generations: old (seq) and young (seq+1). + * MEMCG_NR_GENS is set to three so that when reading the generation counter + * locklessly, a stale value (seq-1) does not wraparound to young. */ -#define MEMCG_NR_GENS 2 +#define MEMCG_NR_GENS 3 #define MEMCG_NR_BINS 8 struct lru_gen_memcg { diff --git a/mm/vmscan.c b/mm/vmscan.c index 4266401df75e..38dee3f8f631 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4810,6 +4810,9 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) else VM_WARN_ON_ONCE(true); + WRITE_ONCE(lruvec->lrugen.seg, seg); + WRITE_ONCE(lruvec->lrugen.gen, new); + hlist_nulls_del_rcu(&lruvec->lrugen.list); if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) @@ -4820,9 +4823,6 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) pgdat->memcg_lru.nr_memcgs[old]--; pgdat->memcg_lru.nr_memcgs[new]++; - lruvec->lrugen.gen = new; - WRITE_ONCE(lruvec->lrugen.seg, seg); - if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); @@ -4845,11 +4845,11 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg) gen = get_memcg_gen(pgdat->memcg_lru.seq); + lruvec->lrugen.gen = gen; + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); pgdat->memcg_lru.nr_memcgs[gen]++; - lruvec->lrugen.gen = gen; - spin_unlock_irq(&pgdat->memcg_lru.lock); } } @@ -5348,7 +5348,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool DEFINE_MAX_SEQ(lruvec); if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) - return 0; + return -1; if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) return nr_to_scan; @@ -5423,7 +5423,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) cond_resched(); } - /* whether try_to_inc_max_seq() was successful */ + /* whether this lruvec should be rotated */ return nr_to_scan < 0; } @@ -5477,13 +5477,13 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) struct lruvec *lruvec; struct lru_gen_folio *lrugen; struct mem_cgroup *memcg; - const struct hlist_nulls_node *pos; + struct hlist_nulls_node *pos; + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); restart: op = 0; memcg = NULL; - gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); rcu_read_lock(); @@ -5494,6 +5494,10 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) } mem_cgroup_put(memcg); + memcg = NULL; + + if (gen != READ_ONCE(lrugen->gen)) + continue; lruvec = container_of(lrugen, struct lruvec, lrugen); memcg = lruvec_memcg(lruvec); @@ -5578,16 +5582,14 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) return; /* - * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> - * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the - * estimated reclaimed_to_scanned_ratio = inactive / total. + * Determine the initial priority based on + * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, + * where reclaimed_to_scanned_ratio = inactive / total. */ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); if (get_swappiness(lruvec, sc)) reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); - reclaimable /= MEMCG_NR_GENS; - /* round down reclaimable and round up sc->nr_to_reclaim */ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); -- 2.25.1