[PATCH openEuler-1.0-LTS 1/4] mm: add ZRAM-Reclaim infrastructure and policy core

9 Jun 2026

hulk inclusion
category: feature

--------------------------------

Introduce CONFIG_ZRAM_RECLAIM, a memcg-level reclaim policy for
ZRAM + HDD environments where anonymous swapins from zram are much
cheaper than file refaults from slow backing storage.

The policy tracks anon swapin and file refault costs per memcg and
uses the sampled costs to bias reclaim:

  - protect cache by selecting SCAN_ANON while zram is not overloaded;
  - raise swappiness to ZRAM_OVERLOAD_SWAPPINESS when anon cost is
    high enough to indicate zram overload;
  - skip the policy in near-OOM, cache-heavy, or over-limit swap usage
    cases.

Expose the legacy cgroup control file as memory.zram_reclaim with the
format "<enable> <ratio>". Enabling without a ratio uses the default
ratio; ratio changes while enabled are rejected so reclaim hot paths see
a stable threshold. Unsupported platforms keep the file visible but
return -EOPNOTSUPP on writes.

Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
---
 include/linux/memcontrol.h   |   7 +
 include/linux/zram_reclaim.h |  29 +++
 mm/Kconfig                   |  23 ++
 mm/Makefile                  |   1 +
 mm/memcontrol.c              |   6 +
 mm/zram_reclaim.c            | 411 +++++++++++++++++++++++++++++++++++
 6 files changed, 477 insertions(+)
 create mode 100644 include/linux/zram_reclaim.h
 create mode 100644 mm/zram_reclaim.c

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9e4d2fbc3012..c22fab3596f7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -346,6 +346,13 @@ struct mem_cgroup_extension {
 	spinlock_t split_queue_lock;
 	struct list_head split_queue;
 	unsigned long split_queue_len;
+#ifdef CONFIG_ZRAM_RECLAIM
+	u32 zram_reclaim_state;		/* packed: bit0=enabled, bit1=bank */
+	unsigned int zram_limit_ratio;	/* swap usage limit in % (0..60) */
+	struct mutex zram_reclaim_lock;	/* serializes control-file writes */
+	atomic_long_t anon_cost[2];	/* per-bank anon swapin cost */
+	atomic_long_t file_cost[2];	/* per-bank file refault cost */
+#endif /* CONFIG_ZRAM_RECLAIM */
 	struct mem_cgroup memcg;
 };
 
diff --git a/include/linux/zram_reclaim.h b/include/linux/zram_reclaim.h
new file mode 100644
index 000000000000..ec19406d201f
--- /dev/null
+++ b/include/linux/zram_reclaim.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ZRAM_RECLAIM_H
+#define _LINUX_ZRAM_RECLAIM_H
+
+#include <linux/memcontrol.h>
+
+#ifdef CONFIG_ZRAM_RECLAIM
+void zram_reclaim_refault_cost(struct mem_cgroup *memcg,
+			       struct lruvec *lruvec, bool file,
+			       unsigned long nr_pages);
+
+bool zram_reclaim_should_use_policy(struct lruvec *lruvec,
+				    struct mem_cgroup *memcg,
+				    signed char priority,
+				    int *swappiness);
+#else /* !CONFIG_ZRAM_RECLAIM */
+static inline void zram_reclaim_refault_cost(struct mem_cgroup *memcg,
+					     struct lruvec *lruvec,
+					     bool file,
+					     unsigned long nr_pages)
+{ }
+
+static inline bool zram_reclaim_should_use_policy(struct lruvec *lruvec,
+						  struct mem_cgroup *memcg,
+						  signed char priority,
+						  int *swappiness)
+{ return false; }
+#endif /* CONFIG_ZRAM_RECLAIM */
+#endif /* _LINUX_ZRAM_RECLAIM_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index e607d1576140..5164677ca73d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -528,6 +528,29 @@ config MEMCG_QOS
 
 	  If unsure, say "n".
 
+config ZRAM_RECLAIM
+	bool "Optimized Reclaim Policy for ZRAM Environments"
+	depends on MEMCG_SWAP
+	help
+	  Enable advanced memory reclaim strategy for ZRAM + HDD environments.
+	  The policy logic is isolated in mm/zram_reclaim.c and supports
+	  CPU-based auto-tuning (e.g., HISI) and proactive anon reclaim.
+
+	  The current auto-tuned implementation is supported only on ARM64
+	  Hisilicon systems recognized by zram_reclaim_init(). On other
+	  platforms the legacy memcg control file is still visible, but
+	  writes fail with -EOPNOTSUPP because no default weight is provided.
+
+	  This policy assumes anonymous swapins are primarily served by zram,
+	  while file refaults represent a slower backing-storage path. It is
+	  intended for swap topologies where that latency split is true.
+
+	  The legacy memcg control file is memory.zram_reclaim and uses the
+	  format "<enable> <ratio>". Ratio changes require disabling the
+	  feature first with "0 0", so reclaim paths run with a stable
+	  threshold. A disabled -> enabled transition starts a fresh
+	  accounting round.
+
 config CMA
 	bool "Contiguous Memory Allocator"
 	depends on HAVE_MEMBLOCK && MMU
diff --git a/mm/Makefile b/mm/Makefile
index 7f19e97ce466..bb4646ad9069 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -111,3 +111,4 @@ obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
 obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
 obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
+obj-$(CONFIG_ZRAM_RECLAIM) += zram_reclaim.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b9d083303cf6..7e25b45ca1b8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5320,6 +5320,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	struct mem_cgroup_extension *memcg_ext;
 
 	memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg);
+#ifdef CONFIG_ZRAM_RECLAIM
+	mutex_destroy(&memcg_ext->zram_reclaim_lock);
+#endif
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->stat_cpu);
@@ -5355,6 +5358,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (!memcg_ext)
 		return ERR_PTR(error);
 
+#ifdef CONFIG_ZRAM_RECLAIM
+	mutex_init(&memcg_ext->zram_reclaim_lock);
+#endif
 	memcg = &memcg_ext->memcg;
 	memcg->id.id = mem_cgroup_alloc_id();
 	if (memcg->id.id < 0) {
diff --git a/mm/zram_reclaim.c b/mm/zram_reclaim.c
new file mode 100644
index 000000000000..b7c2d490d339
--- /dev/null
+++ b/mm/zram_reclaim.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bits.h>
+#include <linux/mutex.h>
+#include <linux/overflow.h>
+#include <linux/swap.h>
+#include <linux/zram_reclaim.h>
+
+#ifdef CONFIG_ARM64
+#include <asm/cputype.h>
+#endif
+
+#define ZRAM_PRIORITY_DELTA		10	/* levels below DEF_PRIORITY */
+#define ZRAM_PRIORITY_FLOOR		(DEF_PRIORITY - ZRAM_PRIORITY_DELTA)
+#define ZRAM_LIMIT_RATIO_DEFAULT	30	/* ratio when omitted */
+#define ZRAM_ABS_COST_MIN		5000UL	/* floor for abs_threshold */
+#define ZRAM_ABS_COST_SHIFT		10	/* total_anon >> shift */
+#define ZRAM_OVERLOAD_SWAPPINESS	150	/* used on zram overload */
+#define ZRAM_RECLAIM_STATE_ENABLED	BIT(0)
+#define ZRAM_RECLAIM_STATE_BANK		BIT(1)	/* selects cost bank 0 or 1 */
+
+static unsigned int zram_file_weight_default;	/* 0: unsupported CPU */
+
+static inline bool zram_reclaim_state_enabled(u32 state)
+{
+	return (state & ZRAM_RECLAIM_STATE_ENABLED) != 0;
+}
+
+static inline unsigned int zram_reclaim_state_bank(u32 state)
+{
+	return (state & ZRAM_RECLAIM_STATE_BANK) ? 1 : 0;
+}
+
+static unsigned long zram_reclaim_read_counter(atomic_long_t *counter)
+{
+	long snapshot = atomic_long_read(counter);
+
+	return snapshot > 0 ? (unsigned long)snapshot : 0UL;
+}
+
+static void zram_reclaim_decay_counter(atomic_long_t *counter,
+				       unsigned long decay)
+{
+	long cur, next;
+
+	if (decay == 0)
+		return;
+
+	for (;;) {	/* retry from a fresh read until the cmpxchg wins */
+		cur = atomic_long_read(counter);
+		if (cur <= 0)
+			return;
+		next = (unsigned long)cur > decay ? cur - decay : 0;
+		if (atomic_long_cmpxchg(counter, cur, next) == cur)
+			return;
+	}
+}
+
+static bool zram_reclaim_anon_overloaded(unsigned long a_cost,
+					 unsigned long f_cost,
+					 unsigned long abs_threshold,
+					 unsigned int weight)
+{
+	u64 scaled, bound;
+
+	if (a_cost <= abs_threshold)
+		return false;
+
+	/*
+	 * bound = f_cost * weight + 1. An overflowing product or sum is
+	 * reported as "not overloaded" instead of wrapping the bound.
+	 */
+	if (check_mul_overflow((u64)f_cost, (u64)weight, &scaled) ||
+	    check_add_overflow(scaled, 1ULL, &bound))
+		return false;
+	return (u64)a_cost > bound;
+}
+
+/*
+ * zram_reclaim_refault_cost - record a refault/swapin cost and decay history.
+ * @memcg: The mem_cgroup to update; NULL is ignored.
+ * @lruvec: The lruvec whose LRU sizes set the decay threshold.
+ * @file: True for file refault cost, false for anonymous swapin cost.
+ * @nr_pages: Cost units counted in pages/events, not time.
+ *
+ * Caller must ensure @memcg and @lruvec remain valid for the duration of
+ * the call. Some call sites obtain @memcg under RCU, so this function must
+ * not sleep.
+ */
+void zram_reclaim_refault_cost(struct mem_cgroup *memcg,
+			       struct lruvec *lruvec, bool file,
+			       unsigned long nr_pages)
+{
+	unsigned long a, f, a_decay, f_decay, total, sum, threshold;
+	unsigned int bank;
+	u32 state;
+	struct mem_cgroup_extension *memcg_ext;
+
+	if (!memcg)
+		return;
+
+	memcg_ext = to_memcg_ext(memcg);
+	/* Acquire: pair with write-path smp_store_release so that the
+	 * enabled bit and the bank are taken from one state snapshot.
+	 */
+	state = smp_load_acquire(&memcg_ext->zram_reclaim_state);
+	if (!zram_reclaim_state_enabled(state))
+		return;
+
+	bank = zram_reclaim_state_bank(state);
+
+	if (file)
+		atomic_long_add(nr_pages, &memcg_ext->file_cost[bank]);
+	else
+		atomic_long_add(nr_pages, &memcg_ext->anon_cost[bank]);
+
+	a = zram_reclaim_read_counter(&memcg_ext->anon_cost[bank]);
+	f = zram_reclaim_read_counter(&memcg_ext->file_cost[bank]);
+
+	total = lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES) +
+		lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) +
+		lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
+	threshold = total >> 2;	/* a quarter of the four LRU sizes */
+
+	if (check_add_overflow(a, f, &sum))
+		sum = ULONG_MAX;	/* saturate rather than wrap */
+	if (sum > threshold) {
+		a_decay = a >> 1;	/* halve both sampled costs */
+		f_decay = f >> 1;
+		zram_reclaim_decay_counter(&memcg_ext->anon_cost[bank],
+					   a_decay);
+		zram_reclaim_decay_counter(&memcg_ext->file_cost[bank],
+					   f_decay);
+	}
+}
+
+/*
+ * zram_reclaim_should_use_policy - decide whether reclaim applies the policy.
+ *
+ * Returns true only when the policy is enabled for @memcg, none of the
+ * bail-out checks below triggers and zram is not overloaded. On overload
+ * it stores ZRAM_OVERLOAD_SWAPPINESS in *@swappiness and returns false.
+ */
+bool zram_reclaim_should_use_policy(struct lruvec *lruvec,
+				    struct mem_cgroup *memcg,
+				    signed char priority,
+				    int *swappiness)
+{
+	bool zram_overloaded;
+	struct mem_cgroup_extension *memcg_ext;
+	unsigned int limit, weight = zram_file_weight_default;
+	unsigned long abs_threshold;
+	unsigned int bank;
+	unsigned long total_anon, total_file, a_cost, f_cost;
+	unsigned long nr_mem, nr_swap_total, nr_swap, total;
+	u32 state;
+
+	if (!memcg || !weight)
+		return false;
+
+	memcg_ext = to_memcg_ext(memcg);
+	/* Acquire: pair with write-path smp_store_release so that
+	 * enabled/bank/ratio are observed as a consistent snapshot.
+	 */
+	state = smp_load_acquire(&memcg_ext->zram_reclaim_state);
+	if (!zram_reclaim_state_enabled(state))
+		return false;
+
+	bank = zram_reclaim_state_bank(state);
+
+	if (!current_is_kswapd() && priority < ZRAM_PRIORITY_FLOOR)
+		return false;
+
+	a_cost = zram_reclaim_read_counter(&memcg_ext->anon_cost[bank]);
+	f_cost = zram_reclaim_read_counter(&memcg_ext->file_cost[bank]);
+
+	/*
+	 * Workload Classifier: Avoid forcing ANON reclaim when anon is scarce
+	 * (total_anon < total_file / 2). Prevents this cgroup from becoming
+	 * a "memory black hole" that starves other cgroups of reclaim pressure.
+	 */
+	total_anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
+		     lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
+	total_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+		     lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
+
+	/*
+	 * Cost threshold: 0.1% of total_anon (min 5000). For 400GB cgroup,
+	 * this is ~3s ZRAM latency, enough to absorb concurrency spikes.
+	 */
+	abs_threshold = max(ZRAM_ABS_COST_MIN,
+			    total_anon >> ZRAM_ABS_COST_SHIFT);
+
+	/* Skip aggressive strategy for Cache-heavy workloads */
+	if (total_anon < total_file / 2)
+		return false;
+
+	/*
+	 * Swap usage gate, applied only with swap accounting enabled:
+	 *   nr_swap = memsw - memory
+	 *   swap_ratio = nr_swap / (nr_swap + total_anon) * 100
+	 * Return false (default reclaim balance) when the limit is unset,
+	 * no swap is charged yet, or swap_ratio exceeds zram_limit_ratio.
+	 */
+	if (do_swap_account) {
+		limit = READ_ONCE(memcg_ext->zram_limit_ratio);
+
+		if (!limit)
+			return false;
+
+		nr_mem = page_counter_read(&memcg->memory);
+		nr_swap_total = page_counter_read(&memcg->memsw);
+
+		if (nr_swap_total <= nr_mem)
+			return false;
+
+		nr_swap = nr_swap_total - nr_mem;
+		total = nr_swap + total_anon;
+
+		if (total && (u64)nr_swap * 100 > (u64)limit * total)
+			return false;
+	}
+
+	/*
+	 * Double gate: zram counts as overloaded only when both hold:
+	 * 1. Absolute: anon_cost > abs_threshold (~0.1% of total_anon)
+	 * 2. Relative: anon_cost > file_cost * weight + 1 (zram costlier)
+	 * Not overloaded: return true. Overloaded: raise swappiness instead.
+	 */
+	zram_overloaded = zram_reclaim_anon_overloaded(a_cost, f_cost,
+						       abs_threshold, weight);
+
+	if (!zram_overloaded)
+		return true;
+
+	*swappiness = ZRAM_OVERLOAD_SWAPPINESS;
+
+	return false;
+}
+
+/*
+ * memory.zram_reclaim ABI:
+ *   "<enable> <ratio>\n"
+ *
+ * Keep the format stable for both show() and write(). While enabled, users
+ * must disable the feature before changing the ratio so reclaim paths observe
+ * a stable threshold configuration. Disable must be written as "0 0".
+ * A disabled -> enabled transition starts a fresh accounting round on the
+ * alternate cost bank.
+ */
+static int memory_zram_reclaim_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup_extension *memcg_ext = to_memcg_ext(memcg);
+	unsigned int ratio;
+	u32 state;
+
+	/* Writers hold this lock, so <enable> and <ratio> stay paired. */
+	mutex_lock(&memcg_ext->zram_reclaim_lock);
+	state = READ_ONCE(memcg_ext->zram_reclaim_state);
+	ratio = READ_ONCE(memcg_ext->zram_limit_ratio);
+	mutex_unlock(&memcg_ext->zram_reclaim_lock);
+	seq_printf(m, "%u %u\n", zram_reclaim_state_enabled(state), ratio);
+	return 0;
+}
+
+/* memory.zram_reclaim write handler: returns nbytes or a negative errno. */
+static ssize_t memory_zram_reclaim_write(struct kernfs_open_file *of,
+					 char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	struct mem_cgroup_extension *memcg_ext = to_memcg_ext(memcg);
+	char *token;
+	bool ratio_specified = false;
+	unsigned long enable;
+	unsigned long ratio = 0;
+	u32 state, new_state;
+	unsigned int bank;
+	int ret;
+
+	if (!zram_file_weight_default)
+		return -EOPNOTSUPP;
+
+	/* Trim surrounding blanks, then parse the first token (enable). */
+	buf = strstrip(buf);
+	token = strsep(&buf, " \t");
+	if (!token)
+		return -EINVAL;
+
+	ret = kstrtoul(token, 0, &enable);
+	if (ret)
+		return ret;
+
+	if (enable != 0 && enable != 1)
+		return -EINVAL;
+
+	/* Parse second token (ratio) if exists */
+	if (buf) {
+		token = strstrip(buf);
+		if (token && *token) {
+			ratio_specified = true;
+			ret = kstrtoul(token, 0, &ratio);
+			if (ret)
+				return ret;
+			if (ratio > 60)
+				return -EINVAL;
+		}
+	}
+
+	mutex_lock(&memcg_ext->zram_reclaim_lock);
+	state = READ_ONCE(memcg_ext->zram_reclaim_state);
+	bank = zram_reclaim_state_bank(state);
+
+	/* Handle disable; the next enable starts a fresh accounting round. */
+	if (enable == 0) {
+		if (!ratio_specified || ratio) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
+		WRITE_ONCE(memcg_ext->zram_limit_ratio, 0);
+		/* Release: pairs with read-path smp_load_acquire; a reader
+		 * that observes the disabled state also observes ratio 0.
+		 */
+		smp_store_release(&memcg_ext->zram_reclaim_state,
+				  bank ? ZRAM_RECLAIM_STATE_BANK : 0);
+		ret = nbytes;
+		goto out_unlock;
+	}
+
+	/* Handle enable */
+	if (!ratio_specified) {
+		ratio = ZRAM_LIMIT_RATIO_DEFAULT;
+	} else if (!ratio) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Keep the ratio stable while enabled so reclaim hot paths only race
+	 * with on/off transitions, not threshold changes.
+	 */
+	if (zram_reclaim_state_enabled(state) &&
+	    ratio != READ_ONCE(memcg_ext->zram_limit_ratio)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	/*
+	 * Start a fresh accounting round only on disabled -> enabled.
+	 * Re-enabling flips to the other bank so readers never combine an
+	 * old enable decision with a new bank selection.
+	 */
+	if (!zram_reclaim_state_enabled(state)) {
+		bank ^= 1;
+		atomic_long_set(&memcg_ext->anon_cost[bank], 0);
+		atomic_long_set(&memcg_ext->file_cost[bank], 0);
+	}
+
+	WRITE_ONCE(memcg_ext->zram_limit_ratio, ratio);
+	new_state = ZRAM_RECLAIM_STATE_ENABLED;
+	if (bank)
+		new_state |= ZRAM_RECLAIM_STATE_BANK;
+	/* Release: a reader that observes the new enabled/bank state
+	 * also observes the cleared bank counters and the updated ratio.
+	 */
+	smp_store_release(&memcg_ext->zram_reclaim_state, new_state);
+
+	ret = nbytes;
+out_unlock:
+	mutex_unlock(&memcg_ext->zram_reclaim_lock);
+	return ret;
+}
+
+static int __init zram_reclaim_init(void)
+{
+#ifdef CONFIG_ARM64
+	u64 midr = read_cpuid_id();
+
+	/*
+	 * The file-vs-zram cost weight depends on the CPU's compression and
+	 * decompression capability; these defaults are empirical results from
+	 * HiSilicon chips. Other CPUs keep weight 0, leaving the policy off.
+	 * NOTE(review): ">=" assumes newer parts have larger part numbers.
+	 */
+	if (MIDR_IMPLEMENTOR(midr) == ARM_CPU_IMP_HISI) {
+		u32 part = MIDR_PARTNUM(midr);
+
+		if (part == HISI_CPU_PART_TSV110)
+			zram_file_weight_default = 64;
+		else if (part >= HISI_CPU_PART_TSV200)
+			zram_file_weight_default = 100;
+	}
+#endif /* CONFIG_ARM64 */
+	return 0;
+}
+early_initcall(zram_reclaim_init);
+
+static int __init zram_reclaim_sysfs_init(void)
+{
+	static struct cftype zram_cftypes[] = {
+		{
+			.name		= "zram_reclaim",
+			.flags		= CFTYPE_NOT_ON_ROOT,
+			.seq_show	= memory_zram_reclaim_show,
+			.write		= memory_zram_reclaim_write,
+		},
+		{ }	/* sentinel: ends the legacy cftype array */
+	};
+	return cgroup_add_legacy_cftypes(&memory_cgrp_subsys, zram_cftypes);
+}
+subsys_initcall(zram_reclaim_sysfs_init);
-- 
2.25.1