Re: [PATCH OLK-5.10] memcg: introduce memcg early oom feature

5 Sep 2025, 5:26 p.m.
On 2025/9/4 15:52, wangkefeng.wang wrote:
>> meituan inclusion
>> category: feature
>> bugzilla: https://gitee.com/openeuler/kernel/issues/ICTXDJ
>> CVE: NA
>>
>> --------------------------------
>>
>> Introduce memcg early oom feature to trigger OOM killer earlier;
>> this feature is disabled by default.
>
>
>Could we add a mechanism like file_is_tiny for cgroup ?
>
>>
>> Signed-off-by: Zhao Xuedong <zhaoxuedong@meituan.com>
>> ---
>> mm/Kconfig | 14 ++++++++++
>> mm/vmscan.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 87 insertions(+)
>>
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index cc43f5124cb3..89bcb73b6a5b 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -521,6 +521,20 @@ config MEMCG_SWAP_QOS
>> memcg swap control include memory force swapin, swapfile control
>> and swap limit.
>>
>> +config MEMCG_EARLY_OOM
>> + bool "Enable aggressive memcg OOM killing under memory pressure"
>> + depends on MEMCG
>> + depends on X86 || ARM64
>> + default n
>> + help
>> + MEMCG_EARLY_OOM makes memory cgroups trigger OOM killer earlier
>> + and more aggressively when under memory pressure, rather than
>> + attempting to reclaim very small amounts of file pages through
>> + prolonged reclaim attempts.
>> +
>> + Say "y" if you prefer fast OOM kills over prolonged reclaim
>> + attempts.
>> +
>> config ETMEM_SCAN
>> tristate "module: etmem page scan for etmem support"
>> depends on ETMEM
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index e82d7995b548..622122473f06 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -2470,11 +2470,62 @@ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
>> return inactive * inactive_ratio < active;
>> }
>>
>> +#ifdef CONFIG_MEMCG_EARLY_OOM
>> +/* Check if swap usage is over the limit for cgroupv1. */
>> +static bool is_swap_over_limit(struct mem_cgroup *memcg)
>> +{
>> + unsigned long mem_limit = READ_ONCE(memcg->memory.max);
>> + unsigned long memsw_limit = READ_ONCE(memcg->memsw.max);
>> +
>> + if (memsw_limit <= mem_limit)
>> + return false;
>> +
>> + return (page_counter_read(&memcg->memsw) -
>> + page_counter_read(&memcg->memory)) >
>> + (memsw_limit - mem_limit);
>> +}
>> +
>> +/*
>> + * Check if file cache is too small to reclaim and anonymous pages are reclaimable.
>> + * Returns true if:
>> + * 1. File cache (+ free space) is below the minimum threshold (pages_min), AND
>> + * 2. Anonymous pages are allowed to be deactivated, AND
>> + * 3. Anonymous pages are abundant relative to reclaim priority
>> + */
>> +static bool memcg_should_skip_file_reclaim(struct mem_cgroup *memcg,
>> + struct scan_control *sc,
>> + struct lruvec *lruvec)
>> +{
>> + unsigned long file, anon, free;
>> + unsigned long mem_limit, memsw_usage, mem_high;
>> + unsigned long pages_min;
>> +
>> + if (!cgroup_reclaim(sc))
>> + return false;
>> +
>> + file = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) +
>> + lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, sc->reclaim_idx);
>> + mem_limit = READ_ONCE(memcg->memory.max);
>> + memsw_usage = page_counter_read(&memcg->memsw);
>> + mem_high = READ_ONCE(memcg->memory.high);
>> + anon = lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx);
>> + free = mem_limit > memsw_usage ? mem_limit - memsw_usage : 0;
>> + pages_min = mem_limit > mem_high ? (mem_limit - mem_high) >> 2 : 0;
>> +
>> + return (file + free <= pages_min) &&
>> + !(sc->may_deactivate & DEACTIVATE_ANON) &&
>> + (anon >> sc->priority);
>> +}
>> +#endif
>> +
>> enum scan_balance {
>> SCAN_EQUAL,
>> SCAN_FRACT,
>> SCAN_ANON,
>> SCAN_FILE,
>> +#ifdef CONFIG_MEMCG_EARLY_OOM
>> + SCAN_NONE,
>> +#endif
>> };
>>
>> /*
>> @@ -2498,6 +2549,20 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>> unsigned long ap, fp;
>> enum lru_list lru;
>>
>> +#ifdef CONFIG_MEMCG_EARLY_OOM
>> + /*
>> + * if both file and anon pages are deemed non-reclaimable,
>> + * we deliberately stop reclaiming early to trigger OOM killer
>> + * faster.
>> + */
>> + if (cgroup_reclaim(sc) &&
>> + is_swap_over_limit(memcg) &&
>> + memcg_should_skip_file_reclaim(memcg, sc, lruvec)) {
>> + scan_balance = SCAN_NONE;
>> + goto out;
>> + }
>> +#endif
>> +
>
>Maybe check file in prepare_scan_count() like global reclaim ?
>
> if (!cgroup_reclaim(sc)) {
>     sc->file_is_tiny = global file check
> } else {
>     sc->file_is_tiny = cgroup file check
> }
>
>
This patch is indeed inspired by the file_is_tiny approach, but there are
two considerations regarding its application to memcg reclaim:

1. Is placing file_is_tiny in scan_control (sc) appropriate for memcg
reclaim? Since sc represents the reclaim policy that propagates to both
the target memcg and all of its children, this might be too broad.

2. Where should we actually set file_is_tiny? If we keep it in sc and set
it during prepare_scan_count(), it would enforce the same policy across
the target memcg and all of its descendants, even when some of them
actually maintain sufficient file pages. This seems suboptimal, which is
why the patch evaluates the check per lruvec in get_scan_count() (see the
sketch below).
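
For illustration only, here is a minimal, untested sketch of that
direction, combining your suggestion with a per-lruvec decision; the
helper name lruvec_file_is_tiny is hypothetical:

/*
 * Hypothetical, untested sketch: keep the "file is tiny" decision
 * per lruvec instead of caching a single sc->file_is_tiny flag in
 * prepare_scan_count(), so a descendant memcg that still has plenty
 * of file pages is not forced onto the anon-only path.
 */
static bool lruvec_file_is_tiny(struct lruvec *lruvec, struct scan_control *sc)
{
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);

	/* Global reclaim: keep the flag computed in prepare_scan_count(). */
	if (!cgroup_reclaim(sc))
		return sc->file_is_tiny;

	/* cgroup reclaim: re-evaluate per memcg, e.g. this patch's check. */
	return memcg_should_skip_file_reclaim(memcg, sc, lruvec);
}

get_scan_count() could then test lruvec_file_is_tiny(lruvec, sc) instead
of sc->file_is_tiny, so the policy stays local to each memcg rather than
being widened through sc.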
>
>
>> if (sc->not_file) {
>> scan_balance = SCAN_ANON;
>> goto out;
>> @@ -2543,7 +2608,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>> /*
>> * If the system is almost out of file pages, force-scan anon.
>> */
>> +#ifdef CONFIG_MEMCG_EARLY_OOM
>> + if (sc->file_is_tiny ||
>> + memcg_should_skip_file_reclaim(memcg, sc, lruvec)) {
>> +#else
>> if (sc->file_is_tiny) {
>> +#endif
>> scan_balance = SCAN_ANON;
>> goto out;
>> }
>> @@ -2687,6 +2757,9 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>> if ((scan_balance == SCAN_FILE) != file)
>> scan = 0;
>> break;
>> + case SCAN_NONE:
>> + scan = 0;
>> + break;
>> default:
>> /* Look ma, no brain */
>> BUG();