From: Yu Liao liaoyu15@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
This patch adds a sysctl to clear the pages in the free lists of each NUMA node. For each NUMA node, every page in the free lists is cleared; this work is scheduled on a random CPU of that NUMA node.
When KASAN is enabled, the shadow memory of free pages is filled with 0xFF, so writing to these free pages would trigger use-after-free (UAF) reports. KASAN instrumentation is therefore disabled for clear_freelist_page.c.
Signed-off-by: Yu Liao liaoyu15@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/sysctl/vm.txt | 13 +++ mm/Kconfig | 7 ++ mm/Makefile | 2 + mm/clear_freelist_page.c | 163 ++++++++++++++++++++++++++++++++++++ 4 files changed, 185 insertions(+) create mode 100644 mm/clear_freelist_page.c
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 7d73882e2c273..8d824892d00d6 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -20,6 +20,7 @@ Currently, these files are in /proc/sys/vm:
- admin_reserve_kbytes - block_dump +- clear_freelist_pages - compact_memory - compact_unevictable_allowed - dirty_background_bytes @@ -104,6 +105,18 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt.
==============================================================
+clear_freelist_pages + +Available only when CONFIG_CLEAR_FREELIST_PAGE is set. When 1 is written to the +file, all pages in the free lists are overwritten with 0. + +The zone lock is held while clear_freelist_pages runs; if the execution time is +too long, RCU CPU stall warnings will be printed. For each NUMA node, +clear_freelist_pages is performed on a "random" CPU of that NUMA node. +The time taken depends on the hardware. + +============================================================== + compact_memory
Available only when CONFIG_COMPACTION is set. When 1 is written to the file, diff --git a/mm/Kconfig b/mm/Kconfig index 80d7b47ca9f53..3a38eb4a6f020 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -849,4 +849,11 @@ config MEMORY_RELIABLE To enable this function, mirrored memory is needed and "kernelcore=reliable" need to be added in kernel parameters.
+config CLEAR_FREELIST_PAGE + bool "Support for clear free list pages" + depends on MMU && SYSCTL + default n + help + Say y here to enable the clear free list pages feature. + endmenu diff --git a/mm/Makefile b/mm/Makefile index 741f9c250914c..38291476ce222 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -6,6 +6,7 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n +KASAN_SANITIZE_clear_freelist_page.o := n
# These files are disabled because they produce non-interesting and/or # flaky coverage that is not a function of syscall inputs. E.g. slab is out of @@ -110,3 +111,4 @@ obj-$(CONFIG_PIN_MEMORY) += pin_mem.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o +obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
diff --git a/mm/clear_freelist_page.c b/mm/clear_freelist_page.c
new file mode 100644
index 0000000000000..69975f458dc79
--- /dev/null
+++ b/mm/clear_freelist_page.c
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Support for clear free list pages.
+ */
+
+#include <linux/mmzone.h>
+#include <linux/mm_types.h>
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+
+/*
+ * Walk every populated zone of @pgdat. The loop body attaches to the trailing
+ * "else", so zones for which populated_zone() is false are skipped.
+ */
+#define for_each_populated_zone_pgdat(pgdat, zone) \
+	for (zone = (pgdat)->node_zones; \
+	     zone; \
+	     zone = next_pgdat_zone(zone)) \
+		if (!populated_zone(zone)) \
+			; /* do nothing */ \
+		else
+
+/* One unit of clearing work: the node to clear plus its work item. */
+struct pgdat_entry {
+	struct pglist_data *pgdat;
+	struct work_struct work;
+};
+
+/* Woken by the last worker to finish; clear_freelist_pages() sleeps on it. */
+static DECLARE_WAIT_QUEUE_HEAD(clear_freelist_wait);
+/* Serializes whole clearing runs triggered through the sysctl. */
+static DEFINE_MUTEX(clear_freelist_lock);
+/* Number of queued per-node workers that have not finished yet. */
+static atomic_t clear_freelist_workers;
+/* Pages cleared during the current run (reported via pr_debug). */
+static atomic_t clear_pages_num;
+/* Lower and upper bound for the sysctl value: only 1 is accepted. */
+static int one = 1;
+
+/*
+ * next_pgdat_zone - helper magic for for_each_populated_zone_pgdat()
+ *
+ * Returns the zone following @zone in its node's node_zones array, or NULL
+ * once @zone is the last of the MAX_NR_ZONES entries.
+ */
+static struct zone *next_pgdat_zone(struct zone *zone)
+{
+	pg_data_t *pgdat = zone->zone_pgdat;
+
+	if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
+		zone++;
+	else
+		zone = NULL;
+	return zone;
+}
+
+/*
+ * Work function: zero every page on the free lists of one node.
+ *
+ * Each populated zone is processed with zone->lock held and interrupts
+ * disabled; the NMI watchdog is touched after every free-list entry because
+ * a zone can take a long time. Frees @entry and wakes the waiter when the
+ * last outstanding worker is done.
+ */
+static void clear_pgdat_freelist_pages(struct work_struct *work)
+{
+	struct pgdat_entry *entry = container_of(work, struct pgdat_entry, work);
+	struct pglist_data *pgdat = entry->pgdat;
+	unsigned long flags, order, t;
+	struct page *page;
+	struct zone *zone;
+
+	for_each_populated_zone_pgdat(pgdat, zone) {
+		spin_lock_irqsave(&zone->lock, flags);
+		for_each_migratetype_order(order, t) {
+			list_for_each_entry(page, &zone->free_area[order].free_list[t], lru) {
+#ifdef CONFIG_KMAP_LOCAL
+				int i;
+
+				/* Clear highmem by clear_highpage() */
+				for (i = 0; i < (1 << order); i++)
+					clear_highpage(page + i);
+#else
+				/*
+				 * NOTE(review): page_address() is assumed to be
+				 * valid for every free page here - confirm for
+				 * highmem configurations.
+				 */
+				memset(page_address(page), 0, (1 << order) * PAGE_SIZE);
+#endif
+				touch_nmi_watchdog();
+				atomic_add(1 << order, &clear_pages_num);
+			}
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+
+		cond_resched();
+	}
+	kfree(entry);
+
+	if (atomic_dec_and_test(&clear_freelist_workers))
+		wake_up(&clear_freelist_wait);
+}
+
+/*
+ * Allocate the work item for @pgdat and queue it for that node.
+ *
+ * The worker is counted in clear_freelist_workers only after its work item
+ * has been allocated: counting it before a failed allocation would leave a
+ * reference that no worker ever drops, and clear_freelist_pages() would wait
+ * forever with clear_freelist_lock held.
+ */
+static void init_clear_freelist_work(struct pglist_data *pgdat)
+{
+	struct pgdat_entry *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		pr_warn("clear_freelist: no memory, node %d is not cleared\n",
+			pgdat->node_id);
+		return;
+	}
+
+	entry->pgdat = pgdat;
+	INIT_WORK(&entry->work, clear_pgdat_freelist_pages);
+
+	/* Count before queueing: the worker decrements when it finishes. */
+	atomic_inc(&clear_freelist_workers);
+	queue_work_node(pgdat->node_id, system_unbound_wq, &entry->work);
+}
+
+/*
+ * Clear the free pages of every online node and wait for completion.
+ *
+ * drain_all_pages() runs first, then one worker per online node is queued
+ * and the caller sleeps until all of them have finished.
+ */
+static void clear_freelist_pages(void)
+{
+	struct pglist_data *pgdat;
+
+	mutex_lock(&clear_freelist_lock);
+	drain_all_pages(NULL);
+
+	for_each_online_pgdat(pgdat)
+		init_clear_freelist_work(pgdat);
+
+	wait_event(clear_freelist_wait, atomic_read(&clear_freelist_workers) == 0);
+
+	pr_debug("Cleared pages %d\nFree pages %lu\n", atomic_read(&clear_pages_num),
+		 global_zone_page_state(NR_FREE_PAGES));
+	atomic_set(&clear_pages_num, 0);
+
+	mutex_unlock(&clear_freelist_lock);
+}
+
+/*
+ * Handler for /proc/sys/vm/clear_freelist_pages.
+ *
+ * The value is parsed into a local variable through a private copy of @table.
+ * Pointing the shared table's ->data at this stack frame would race between
+ * concurrent writers and leave a dangling pointer behind after returning.
+ * A successful write starts a clearing run; returns the parse result.
+ */
+static int sysctl_clear_freelist_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tmp = *table;
+	int val = 0;
+	int ret;
+
+	tmp.data = &val;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (!ret && write)
+		clear_freelist_pages();
+
+	return ret;
+}
+
+/* Write-only sysctl; extra1/extra2 restrict the accepted value to 1. */
+static struct ctl_table clear_freelist_table[] = {
+	{
+		.procname	= "clear_freelist_pages",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0200,
+		.proc_handler	= &sysctl_clear_freelist_handler,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
+	{ }
+};
+
+/* Parent directory entry placing the table above under "vm". */
+static struct ctl_table sys_ctl_table[] = {
+	{
+		.procname = "vm",
+		.mode = 0555,
+		.child = clear_freelist_table,
+	},
+	{ }
+};
+
+/* Register the sysctl at init time; fails with -ENOMEM if that is refused. */
+static int __init clear_freelist_init(void)
+{
+	if (!register_sysctl_table(sys_ctl_table))
+		return -ENOMEM;
+
+	return 0;
+}
+module_init(clear_freelist_init);