[openEuler-1.0-LTS 1/2] mm: add ZRAM-Reclaim infrastructure and policy core

18 May 2026

Introduce CONFIG_ZRAM_RECLAIM, a new configurable reclaim module
optimized for ZRAM + HDD environments. This patch adds the core
infrastructure including Kconfig option, data structures, cost
tracking counters, and the main policy logic.

Data structure changes:
- Extend struct mem_cgroup_extension with conditionally compiled fields:
  'zram_reclaim' flag to indicate ZRAM-heavy workload
  'zram_limit_ratio' to limit the share of anon memory held in swap
  'anon_cost' and 'file_cost' for tracking refault pressure

- struct mem_cgroup and struct mem_cgroup_per_node are not extended;
  instead, the cost counters are per-memcg atomic_long_t fields placed
  in mem_cgroup_extension ahead of the embedded struct mem_cgroup. The
  nodeinfo array remains the last member of mem_cgroup for ABI
  compatibility.

Core policy logic:
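- zram_reclaim_scan_balance(): called from get_scan_count() to
  dynamically adjust reclaim scan balance. When 'zram_reclaim' is set,
  it checks if anonymous refault pressure exceeds thresholds:
    threshold = max(5000UL, total_anon >> 10)
    overloaded = (anon_cost > threshold) &&
                 (anon_cost > file_cost * weight + 1)
  If overloaded, swappiness is forced to 150 and the function returns
  false to let normal scan proceed with elevated swappiness.
  Otherwise, it returns true to skip balance logic.
  It also returns false, leaving swappiness untouched, for cache-heavy
  cgroups (total_anon * 2 < total_file) and, with swap accounting
  enabled, when no swap is charged yet or the swap share of anon
  memory exceeds zram_limit_ratio.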

Cost tracking (to be hooked in patch 2):
- zram_reclaim_refault_cost() records refault events; its 'file'
  argument selects whether file_cost or anon_cost is charged
- The anon case is to be called from do_swap_page
- Counters decay automatically: when anon_cost + file_cost exceeds
  total_lruvec_size >> 2, both counters are halved

CPU vendor auto-tuning:
- On ARM64 with HiSilicon CPUs, automatically initialize
  zram_file_weight_default based on CPU part number:
    TSV110: weight = 64
    TSV200+: weight = 100
- The weight controls the threshold for overload detection; on any
  other CPU it stays 0, which keeps the policy disabled
- Uses early_initcall() to run before other mm initializations

The feature depends on MEMCG and adds no overhead when
CONFIG_ZRAM_RECLAIM is disabled.

Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
---
 include/linux/memcontrol.h   |   6 +
 include/linux/zram_reclaim.h |  25 ++++
 mm/Kconfig                   |   9 ++
 mm/Makefile                  |   1 +
 mm/zram_reclaim.c            | 244 +++++++++++++++++++++++++++++++++++
 5 files changed, 285 insertions(+)
 create mode 100644 include/linux/zram_reclaim.h
 create mode 100644 mm/zram_reclaim.c

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9e4d2fbc3..9eabc7179 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -346,6 +346,12 @@ struct mem_cgroup_extension {
 	spinlock_t split_queue_lock;
 	struct list_head split_queue;
 	unsigned long split_queue_len;
+#ifdef CONFIG_ZRAM_RECLAIM
+	unsigned int zram_reclaim;
+	unsigned int zram_limit_ratio;
+	atomic_long_t anon_cost;
+	atomic_long_t file_cost;
+#endif
 	struct mem_cgroup memcg;
 };
 
diff --git a/include/linux/zram_reclaim.h b/include/linux/zram_reclaim.h
new file mode 100644
index 000000000..e820201b5
--- /dev/null
+++ b/include/linux/zram_reclaim.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ZRAM_RECLAIM_H
+#define _LINUX_ZRAM_RECLAIM_H
+
+#include <linux/memcontrol.h>
+
+#ifdef CONFIG_ZRAM_RECLAIM
+void zram_reclaim_refault_cost(struct mem_cgroup *memcg, struct lruvec *lruvec,
+				bool file, unsigned long nr_pages);
+
+bool zram_reclaim_scan_balance(struct lruvec *lruvec, struct mem_cgroup *memcg,
+				int *swappiness);
+#else /* !CONFIG_ZRAM_RECLAIM: no-op stubs */
+static inline void zram_reclaim_refault_cost(struct mem_cgroup *memcg,
+						struct lruvec *lruvec,
+						bool file,
+						unsigned long nr_pages)
+{ }
+
+static inline bool zram_reclaim_scan_balance(struct lruvec *lruvec,
+						struct mem_cgroup *memcg,
+						int *swappiness)
+{ return false; }	/* false: the normal scan balance applies */
+#endif /* CONFIG_ZRAM_RECLAIM */
+#endif /* _LINUX_ZRAM_RECLAIM_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index e607d1576..71ff835a2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -528,6 +528,15 @@ config MEMCG_QOS
 
 	  If unsure, say "n".
 
+
+config ZRAM_RECLAIM
+	bool "Optimized Reclaim Policy for ZRAM Environments"
+	depends on MEMCG
+	help
+	  Enable an advanced memory reclaim policy for ZRAM + HDD environments.
+	  The policy logic is kept in its own built-in file, mm/zram_reclaim.c,
+	  with CPU-based auto-tuning (e.g., HISI) and proactive anon reclaim.
+
 config CMA
 	bool "Contiguous Memory Allocator"
 	depends on HAVE_MEMBLOCK && MMU
diff --git a/mm/Makefile b/mm/Makefile
index 7f19e97ce..bb4646ad9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -111,3 +111,4 @@ obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
 obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
 obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
+obj-$(CONFIG_ZRAM_RECLAIM) += zram_reclaim.o
diff --git a/mm/zram_reclaim.c b/mm/zram_reclaim.c
new file mode 100644
index 000000000..215b734d0
--- /dev/null
+++ b/mm/zram_reclaim.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/zram_reclaim.h>
+
+static unsigned int zram_file_weight_default;	/* 0: policy unavailable */
+static unsigned int zram_fallback_swappiness;	/* 0: fall back to 150 */
+
+/**
+ * zram_reclaim_refault_cost - charge a refault and age the cost counters
+ * @memcg: memory cgroup to charge; NULL is ignored.
+ * @lruvec: lruvec whose evictable LRU sizes set the decay threshold.
+ * @file: true to charge file_cost, false to charge anon_cost.
+ * @nr_pages: number of pages added to the selected counter.
+ *
+ * No-op unless zram_reclaim is enabled for @memcg. Once the two counters
+ * together exceed a quarter of @lruvec's evictable pages, each is halved.
+ */
+void zram_reclaim_refault_cost(struct mem_cgroup *memcg, struct lruvec *lruvec,
+				bool file, unsigned long nr_pages)
+{
+	struct mem_cgroup_extension *ext;
+	unsigned long anon, cache, lru_pages = 0;
+	enum lru_list lru;
+
+	if (!memcg)
+		return;
+
+	ext = to_memcg_ext(memcg);
+	if (!ext->zram_reclaim)
+		return;
+
+	atomic_long_add(nr_pages, file ? &ext->file_cost : &ext->anon_cost);
+	anon = atomic_long_read(&ext->anon_cost);
+	cache = atomic_long_read(&ext->file_cost);
+
+	for_each_evictable_lru(lru)
+		lru_pages += lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
+
+	if (anon + cache <= (lru_pages >> 2))
+		return;
+
+	/* Age the history: take half of each sampled value back off. */
+	atomic_long_sub(anon >> 1, &ext->anon_cost);
+	atomic_long_sub(cache >> 1, &ext->file_cost);
+}
+
+/**
+ * zram_reclaim_scan_balance - ZRAM-aware override of the reclaim scan balance
+ * @lruvec: lruvec under reclaim; its LRU sizes feed the checks below.
+ * @memcg: memory cgroup under reclaim; NULL yields false.
+ * @swappiness: in/out; set to the fallback value when ZRAM is overloaded.
+ *
+ * Return: true if the policy applies and ZRAM is not overloaded, so that the
+ * caller can skip its own balance logic; false to keep the normal balance,
+ * with *@swappiness set to the fallback first when ZRAM is overloaded.
+ */
+bool zram_reclaim_scan_balance(struct lruvec *lruvec,
+				struct mem_cgroup *memcg,
+				int *swappiness)
+{
+	unsigned int limit, weight = zram_file_weight_default;
+	struct mem_cgroup_extension *memcg_ext;
+	unsigned long total_anon, total_file, a_cost, f_cost, abs_thresh;
+	unsigned long nr_mem, nr_memsw, nr_swap;
+
+	if (!memcg || !weight)
+		return false;
+
+	memcg_ext = to_memcg_ext(memcg);
+	if (!memcg_ext->zram_reclaim)
+		return false;
+
+	a_cost = atomic_long_read(&memcg_ext->anon_cost);
+	f_cost = atomic_long_read(&memcg_ext->file_cost);
+
+	total_anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
+		     lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
+	total_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+		     lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
+
+	/*
+	 * Workload classifier: leave cache-heavy cgroups (anon below half of
+	 * file) on the default balance, so that this cgroup does not become a
+	 * "memory black hole" that starves other cgroups of reclaim pressure.
+	 */
+	if (total_anon * 2 < total_file)
+		return false;
+
+	/*
+	 * ZRAM usage limit, only checked with swap accounting enabled:
+	 *   nr_swap = memsw - memory
+	 *   swap share = nr_swap / (nr_swap + total_anon) * 100
+	 * Keep the default balance when no limit is set, no swap is charged,
+	 * or the swap share already exceeds zram_limit_ratio percent.
+	 */
+	if (do_swap_account) {
+		limit = READ_ONCE(memcg_ext->zram_limit_ratio);
+		if (!limit)
+			return false;
+
+		nr_mem = page_counter_read(&memcg->memory);
+		nr_memsw = page_counter_read(&memcg->memsw);
+		if (nr_memsw <= nr_mem)
+			return false;
+
+		nr_swap = nr_memsw - nr_mem;
+		if ((u64)nr_swap * 100 > (u64)limit * (nr_swap + total_anon))
+			return false;
+	}
+
+	/*
+	 * Absolute gate: 0.1% of total_anon, at least 5000. For a 400GB cgroup
+	 * this is ~3s of ZRAM latency, enough to absorb concurrency spikes.
+	 */
+	abs_thresh = max(5000UL, total_anon >> 10);
+
+	/*
+	 * Double gate: ZRAM counts as overloaded only when both hold:
+	 * 1. absolute overload: anon_cost > abs_thresh
+	 * 2. relative pain: anon_cost > file_cost * weight + 1 (ZRAM worse
+	 *    than HDD)
+	 */
+	if (a_cost <= abs_thresh || a_cost <= f_cost * weight + 1)
+		return true;
+
+	/* Overloaded: back to the normal balance with fallback swappiness. */
+	*swappiness = zram_fallback_swappiness ?: 150;
+
+	return false;
+}
+
+static int memory_zram_reclaim_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup_extension *memcg_ext = to_memcg_ext(memcg);
+
+	if (memcg_ext->zram_reclaim)	/* enabled: "1 <limit ratio>" */
+		seq_printf(m, "1 %u\n", memcg_ext->zram_limit_ratio);
+	else	/* disabled: plain "0", not the ratio left by an earlier enable */
+		seq_puts(m, "0\n");
+	return 0;
+}
+
+/*
+ * Handle a write of "<enable> [ratio]" to the zram_reclaim cgroup file.
+ * <enable> is 0 or 1; [ratio] is a percentage in 1..100, 30 when omitted.
+ * Returns @nbytes on success, -EOPNOTSUPP while zram_file_weight_default is
+ * 0, -EINVAL (or the kstrtoul() error) for bad input, and -EBUSY when the
+ * ratio of an already enabled cgroup would change.
+ */
+static ssize_t memory_zram_reclaim_write(struct kernfs_open_file *of,
+				      char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	struct mem_cgroup_extension *memcg_ext = to_memcg_ext(memcg);
+	unsigned long enable, ratio = 0;
+	char *token;
+	int ret;
+
+	if (!zram_file_weight_default)
+		return -EOPNOTSUPP;
+
+	/* Drop surrounding whitespace so that " 1\n" parses like "1". */
+	buf = strstrip(buf);
+
+	token = strsep(&buf, " \t");
+	ret = kstrtoul(token, 0, &enable);
+	if (ret)
+		return ret;
+
+	if (enable > 1)
+		return -EINVAL;
+
+	/* Optional second token: the limit ratio. */
+	if (buf) {
+		buf = strstrip(buf);
+		if (*buf) {
+			ret = kstrtoul(buf, 0, &ratio);
+			if (ret)
+				return ret;
+			if (ratio == 0 || ratio > 100)
+				return -EINVAL;
+		}
+	}
+
+	/* Disable: only the flag is cleared, the stored ratio is kept. */
+	if (!enable) {
+		memcg_ext->zram_reclaim = 0;
+		return nbytes;
+	}
+
+	/* "1" without a ratio selects the default of 30. */
+	if (!ratio)
+		ratio = 30;
+
+	/* The ratio is fixed while enabled; changing it takes a disable. */
+	if (memcg_ext->zram_reclaim && ratio != memcg_ext->zram_limit_ratio) {
+		pr_warn_ratelimited("Disable zram_reclaim first (write 0) to change limit ratio.\n");
+		return -EBUSY;
+	}
+
+	/* zram_reclaim_scan_balance() reads the ratio with READ_ONCE(). */
+	WRITE_ONCE(memcg_ext->zram_limit_ratio, ratio);
+	memcg_ext->zram_reclaim = 1;
+
+	return nbytes;
+}
+
+/* Add the "zram_reclaim" control file to non-root legacy memory cgroups. */
+static int __init zram_reclaim_sysfs_init(void)
+{
+	static struct cftype zram_files[] = {
+		{
+			.name = "zram_reclaim",
+			.flags = CFTYPE_NOT_ON_ROOT,
+			.seq_show = memory_zram_reclaim_show,
+			.write = memory_zram_reclaim_write,
+		}, { /* sentinel */ }
+	};
+	return cgroup_add_legacy_cftypes(&memory_cgrp_subsys, zram_files);
+}
+subsys_initcall(zram_reclaim_sysfs_init);
+
+/*
+ * Seed zram_file_weight_default on HiSilicon arm64 CPUs; others keep 0.
+ * Latency measurements (HDD ~10ms vs ZRAM ~3-10us) rate one HDD refault at
+ * about 100 ZRAM swap-ins; the weight keeps ZRAM protection from ending early.
+ */
+static int __init zram_reclaim_init(void)
+{
+#ifdef CONFIG_ARM64
+	u64 midr = read_cpuid_id();
+	u32 part = MIDR_PARTNUM(midr);
+
+	if (MIDR_IMPLEMENTOR(midr) != ARM_CPU_IMP_HISI)
+		return 0;
+
+	if (part == HISI_CPU_PART_TSV110)
+		zram_file_weight_default = 64;
+	else if (part >= HISI_CPU_PART_TSV200)
+		zram_file_weight_default = 100;
+#endif
+	return 0;
+}
+early_initcall(zram_reclaim_init);
-- 
2.33.0