From: David Hildenbrand david@redhat.com
mainline inclusion from linux-5.0-rc1 commit f29d8e9c0191a2a02500945db505e5c89159c3f4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Userspace should always be in charge of how to online memory and whether memory should be onlined automatically in the kernel. Let's drop the parameter used to override this - XEN passes memhp_auto_online, just like add_memory(), so we can use that directly internally.
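As a rough illustration (memory42 is just an example block name), onlining is now controlled solely by the existing interfaces:

  # global policy: should the kernel online hotplugged memory by itself?
  cat /sys/devices/system/memory/auto_online_blocks
  echo online > /sys/devices/system/memory/auto_online_blocks
  # or keep the policy "offline" and let user space online each block:
  echo online > /sys/devices/system/memory/memory42/state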
Link: http://lkml.kernel.org/r/20181123123740.27652-1-david@redhat.com Signed-off-by: David Hildenbrand david@redhat.com Acked-by: Michal Hocko mhocko@suse.com Reviewed-by: Oscar Salvador osalvador@suse.de Acked-by: Juergen Gross jgross@suse.com Cc: Boris Ostrovsky boris.ostrovsky@oracle.com Cc: Stefano Stabellini sstabellini@kernel.org Cc: Dan Williams dan.j.williams@intel.com Cc: Pavel Tatashin pasha.tatashin@oracle.com Cc: David Hildenbrand david@redhat.com Cc: Joonsoo Kim iamjoonsoo.kim@lge.com Cc: Arun KS arunks@codeaurora.org Cc: Mathieu Malaterre malat@debian.org Cc: Stephen Rothwell sfr@canb.auug.org.au Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/xen/balloon.c | 2 +- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 7703ac47062fa..e692dcf576d66 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -352,7 +352,7 @@ static enum bp_state reserve_additional_memory(void) mutex_unlock(&balloon_mutex); /* add_memory_resource() requires the device_hotplug lock */ lock_device_hotplug(); - rc = add_memory_resource(nid, resource, memhp_auto_online); + rc = add_memory_resource(nid, resource); unlock_device_hotplug(); mutex_lock(&balloon_mutex);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8782f0e993704..9d28fca5bbde8 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -326,7 +326,7 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); -extern int add_memory_resource(int nid, struct resource *resource, bool online); +extern int add_memory_resource(int nid, struct resource *resource); extern int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1e20ad038496d..f13144a65b127 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1051,7 +1051,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res, bool online) +int __ref add_memory_resource(int nid, struct resource *res) { u64 start, size; bool new_node = false; @@ -1114,7 +1114,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) mem_hotplug_done();
/* online pages if requested */ - if (online) + if (memhp_auto_online) walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, online_memory_block);
@@ -1138,7 +1138,7 @@ int __ref __add_memory(int nid, u64 start, u64 size) if (IS_ERR(res)) return PTR_ERR(res);
- ret = add_memory_resource(nid, res, memhp_auto_online); + ret = add_memory_resource(nid, res); if (ret < 0) release_memory_resource(res); return ret;
From: David Hildenbrand david@redhat.com
mainline inclusion from linux-5.0-rc1 commit 3f8e9178538189215b59f726f2449a08362e7074 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Let's use the easier to read (and not mess up) variants:
- Use DEVICE_ATTR_RO
- Use DEVICE_ATTR_WO
- Use DEVICE_ATTR_RW
instead of the more generic DEVICE_ATTR() we're using right now.
We have to rename most callback functions. By fixing the indentations we can even save some LOCs.
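As a quick, illustrative sanity check (memory0 is just an example block), the user-visible sysfs files and their permissions stay the same after the rename:

  ls -l /sys/devices/system/memory/memory0/{phys_index,state,phys_device,removable}
  ls -l /sys/devices/system/memory/block_size_bytes
  ls -l /sys/devices/system/memory/auto_online_blocks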
Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: Andrew Morton akpm@linux-foundation.org Cc: Ingo Molnar mingo@kernel.org Cc: Pavel Tatashin pasha.tatashin@oracle.com Cc: Oscar Salvador osalvador@suse.com Cc: Michal Hocko mhocko@kernel.org Cc: Wei Yang richard.weiyang@gmail.com Signed-off-by: David Hildenbrand david@redhat.com Reviewed-by: Wei Yang richard.weiyang@gmail.com Reviewed-by: Oscar Salvador osalvador@suse.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/base/memory.c | 79 ++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 43 deletions(-)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index a4fbf5f5a11f5..35472a2b9d1a6 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -115,8 +115,8 @@ static unsigned long get_memory_block_size(void) * uses. */
-static ssize_t show_mem_start_phys_index(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t phys_index_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); unsigned long phys_index; @@ -128,8 +128,8 @@ static ssize_t show_mem_start_phys_index(struct device *dev, /* * Show whether the section of memory is likely to be hot-removable */ -static ssize_t show_mem_removable(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t removable_show(struct device *dev, struct device_attribute *attr, + char *buf) { unsigned long i, pfn; int ret = 1; @@ -152,8 +152,8 @@ static ssize_t show_mem_removable(struct device *dev, /* * online, offline, going offline, etc. */ -static ssize_t show_mem_state(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t state_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct memory_block *mem = to_memory_block(dev); ssize_t len = 0; @@ -293,7 +293,7 @@ static int memory_subsys_online(struct device *dev) return 0;
/* - * If we are called from store_mem_state(), online_type will be + * If we are called from state_store(), online_type will be * set >= 0 Otherwise we were called from the device online * attribute and need to set the online_type. */ @@ -322,9 +322,8 @@ static int memory_subsys_offline(struct device *dev) return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); }
-static ssize_t -store_mem_state(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) +static ssize_t state_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct memory_block *mem = to_memory_block(dev); int ret, online_type; @@ -381,7 +380,7 @@ store_mem_state(struct device *dev, * s.t. if I offline all of these sections I can then * remove the physical device? */ -static ssize_t show_phys_device(struct device *dev, +static ssize_t phys_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); @@ -402,7 +401,7 @@ static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn, } }
-static ssize_t show_valid_zones(struct device *dev, +static ssize_t valid_zones_show(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); @@ -442,33 +441,31 @@ static ssize_t show_valid_zones(struct device *dev,
return strlen(buf); } -static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL); +static DEVICE_ATTR_RO(valid_zones); #endif
-static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); -static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); -static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); -static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL); +static DEVICE_ATTR_RO(phys_index); +static DEVICE_ATTR_RW(state); +static DEVICE_ATTR_RO(phys_device); +static DEVICE_ATTR_RO(removable);
/* * Block size attribute stuff */ -static ssize_t -print_block_size(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t block_size_bytes_show(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%lx\n", get_memory_block_size()); }
-static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL); +static DEVICE_ATTR_RO(block_size_bytes);
/* * Memory auto online policy. */
-static ssize_t -show_auto_online_blocks(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t auto_online_blocks_show(struct device *dev, + struct device_attribute *attr, char *buf) { if (memhp_auto_online) return sprintf(buf, "online\n"); @@ -476,9 +473,9 @@ show_auto_online_blocks(struct device *dev, struct device_attribute *attr, return sprintf(buf, "offline\n"); }
-static ssize_t -store_auto_online_blocks(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t auto_online_blocks_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { if (sysfs_streq(buf, "online")) memhp_auto_online = true; @@ -490,8 +487,7 @@ store_auto_online_blocks(struct device *dev, struct device_attribute *attr, return count; }
-static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks, - store_auto_online_blocks); +static DEVICE_ATTR_RW(auto_online_blocks);
/* * Some architectures will have custom drivers to do this, and @@ -500,9 +496,8 @@ static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks, * and will require this interface. */ #ifdef CONFIG_ARCH_MEMORY_PROBE -static ssize_t -memory_probe_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t probe_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { u64 phys_addr; int nid, ret; @@ -532,7 +527,7 @@ memory_probe_store(struct device *dev, struct device_attribute *attr, return ret; }
-static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store); +static DEVICE_ATTR_WO(probe); #endif
#ifdef CONFIG_MEMORY_FAILURE @@ -541,10 +536,9 @@ static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store); */
/* Soft offline a page */ -static ssize_t -store_soft_offline_page(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t soft_offline_page_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { int ret; u64 pfn; @@ -563,10 +557,9 @@ store_soft_offline_page(struct device *dev, }
/* Forcibly offline a page, including killing processes. */ -static ssize_t -store_hard_offline_page(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t hard_offline_page_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { int ret; u64 pfn; @@ -579,8 +572,8 @@ store_hard_offline_page(struct device *dev, return ret ? ret : count; }
-static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page); -static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page); +static DEVICE_ATTR_WO(soft_offline_page); +static DEVICE_ATTR_WO(hard_offline_page); #endif
/*
From: David Hildenbrand david@redhat.com
mainline inclusion from linux-5.7-rc1 commit 956f8b445061667c3545baa24778f890d1d522f4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Patch series "mm/memory_hotplug: allow to specify a default online_type", v3.
Distributions nowadays use udev rules ([1] [2]) to specify if and how to online hotplugged memory. The rules seem to get more complex with many special cases. Due to the various special cases, CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE cannot be used. All memory hotplug is handled via udev rules.
Every time we hotplug memory, the udev rule will come to the same conclusion. In particular, Hyper-V (and soon also virtio-mem) adds a lot of memory in separate memory blocks and waits for the memory to get onlined by user space before continuing to add more memory blocks (so as not to add memory faster than it is getting onlined). This of course slows down the whole memory hotplug process.
To make the job of distributions easier and to avoid udev rules that get more and more complicated, let's extend the mechanism provided by
- /sys/devices/system/memory/auto_online_blocks
- "memhp_default_state=" on the kernel cmdline
to be able to specify "online_movable" and "online_kernel" as well.

Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
=== Example /usr/libexec/config-memhotplug ===
#!/bin/bash
VIRT=`systemd-detect-virt --vm`
ARCH=`uname -p`

sense_virtio_mem() {
    if [ -d "/sys/bus/virtio/drivers/virtio_mem/" ]; then
        DEVICES=`find /sys/bus/virtio/drivers/virtio_mem/ -maxdepth 1 -type l | wc -l`
        if [ $DEVICES != "0" ]; then
            return 0
        fi
    fi
    return 1
}

if [ ! -e "/sys/devices/system/memory/auto_online_blocks" ]; then
    echo "Memory hotplug configuration support missing in the kernel"
    exit 1
fi

if grep "memhp_default_state=" /proc/cmdline > /dev/null; then
    echo "Memory hotplug configuration overridden in kernel cmdline (memhp_default_state=)"
    exit 1
fi
if [ $VIRT == "microsoft" ]; then
    echo "Detected Hyper-V on $ARCH"
    # Hyper-V wants all memory in ZONE_NORMAL
    ONLINE_TYPE="online_kernel"
elif sense_virtio_mem; then
    echo "Detected virtio-mem on $ARCH"
    # virtio-mem wants all memory in ZONE_NORMAL
    ONLINE_TYPE="online_kernel"
elif [ $ARCH == "s390x" ] || [ $ARCH == "s390" ]; then
    echo "Detected $ARCH"
    # standby memory should not be onlined automatically
    ONLINE_TYPE="offline"
elif [ $ARCH == "ppc64" ] || [ $ARCH == "ppc64le" ]; then
    echo "Detected $ARCH"
    # PPC64 onlines all hotplugged memory right from the kernel
    ONLINE_TYPE="offline"
elif [ $VIRT == "none" ]; then
    echo "Detected bare-metal on $ARCH"
    # Bare metal users expect hotplugged memory to be unpluggable. We assume
    # that ZONE imbalances on such enterprise servers cannot happen and that
    # this is properly documented
    ONLINE_TYPE="online_movable"
else
    # TODO: Hypervisors that want to unplug DIMMs and can guarantee that ZONE
    # imbalances won't happen
    echo "Detected $VIRT on $ARCH"
    # Usually, ballooning is used in virtual environments, so memory should go to
    # ZONE_NORMAL. However, sometimes "movable_node" is relevant.
    ONLINE_TYPE="online"
fi
echo "Selected online_type:" $ONLINE_TYPE
# Configure what to do with memory that will be hotplugged in the future
echo $ONLINE_TYPE 2>/dev/null > /sys/devices/system/memory/auto_online_blocks
if [ $? != "0" ]; then
    echo "Memory hotplug cannot be configured (e.g., old kernel or missing permissions)"
    # A backup udev rule should handle old kernels if necessary
    exit 1
fi
# Process all already plugged blocks (e.g., DIMMs, but also Hyper-V or virtio-mem)
if [ $ONLINE_TYPE != "offline" ]; then
    for MEMORY in /sys/devices/system/memory/memory*; do
        STATE=`cat $MEMORY/state`
        if [ $STATE == "offline" ]; then
            echo $ONLINE_TYPE > $MEMORY/state
        fi
    done
fi
=== Example /usr/lib/systemd/system/config-memhotplug.service ===
[Unit]
Description=Configure memory hotplug behavior
DefaultDependencies=no
Conflicts=shutdown.target
Before=sysinit.target shutdown.target
After=systemd-modules-load.service
ConditionPathExists=|/sys/devices/system/memory/auto_online_blocks

[Service]
ExecStart=/usr/libexec/config-memhotplug
Type=oneshot
TimeoutSec=0
RemainAfterExit=yes

[Install]
WantedBy=sysinit.target
=== Example modification to the 40-redhat.rules [2] ===
: diff --git a/40-redhat.rules b/40-redhat.rules-new
: index 2c690e5..168fd03 100644
: --- a/40-redhat.rules
: +++ b/40-redhat.rules-new
: @@ -6,6 +6,9 @@ SUBSYSTEM=="cpu", ACTION=="add", TEST=="online", ATTR{online}=="0", ATTR{online}
: # Memory hotadd request
: SUBSYSTEM!="memory", GOTO="memory_hotplug_end"
: ACTION!="add", GOTO="memory_hotplug_end"
: +# memory hotplug behavior configured
: +PROGRAM=="grep online /sys/devices/system/memory/auto_online_blocks", GOTO="memory_hotplug_end"
: +
: PROGRAM="/bin/uname -p", RESULT=="s390*", GOTO="memory_hotplug_end"
:
: ENV{.state}="online"
===
[1] https://github.com/lnykryn/systemd-rhel/pull/281 [2] https://github.com/lnykryn/systemd-rhel/blob/staging/rules/40-redhat.rules
This patch (of 8):
The name is misleading and it's not really clear what is "kept". Let's just name it like the online_type name we expose to user space ("online").
Add some documentation to the types.
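For illustration (memory16 is just an example block), these values correspond to the strings user space already writes to the per-block state file:

  # "online" lets the kernel pick the zone (MMOP_ONLINE),
  # "online_kernel" requests ZONE_NORMAL, "online_movable" requests ZONE_MOVABLE
  echo online > /sys/devices/system/memory/memory16/state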
Signed-off-by: David Hildenbrand david@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Wei Yang richard.weiyang@gmail.com Reviewed-by: Baoquan He bhe@redhat.com Acked-by: Pankaj Gupta pankaj.gupta.linux@gmail.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Michal Hocko mhocko@kernel.org Cc: Oscar Salvador osalvador@suse.de Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: Wei Yang richard.weiyang@gmail.com Cc: Vitaly Kuznetsov vkuznets@redhat.com Cc: Yumei Huang yuhuang@redhat.com Cc: Igor Mammedov imammedo@redhat.com Cc: Eduardo Habkost ehabkost@redhat.com Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Haiyang Zhang haiyangz@microsoft.com Cc: K. Y. Srinivasan kys@microsoft.com Cc: Michael Ellerman mpe@ellerman.id.au (powerpc) Cc: Paul Mackerras paulus@samba.org Cc: Stephen Hemminger sthemmin@microsoft.com Cc: Wei Liu wei.liu@kernel.org Link: http://lkml.kernel.org/r/20200319131221.14044-1-david@redhat.com Link: http://lkml.kernel.org/r/20200317104942.11178-2-david@redhat.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Ma Wupeng mawupeng1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/base/memory.c | 9 +++++---- include/linux/memory_hotplug.h | 6 +++++- 2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 35472a2b9d1a6..74d9bf9dddaab 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -298,7 +298,7 @@ static int memory_subsys_online(struct device *dev) * attribute and need to set the online_type. */ if (mem->online_type < 0) - mem->online_type = MMOP_ONLINE_KEEP; + mem->online_type = MMOP_ONLINE;
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
@@ -337,7 +337,7 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, else if (sysfs_streq(buf, "online_movable")) online_type = MMOP_ONLINE_MOVABLE; else if (sysfs_streq(buf, "online")) - online_type = MMOP_ONLINE_KEEP; + online_type = MMOP_ONLINE; else if (sysfs_streq(buf, "offline")) online_type = MMOP_OFFLINE; else { @@ -348,7 +348,7 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, switch (online_type) { case MMOP_ONLINE_KERNEL: case MMOP_ONLINE_MOVABLE: - case MMOP_ONLINE_KEEP: + case MMOP_ONLINE: /* mem->online_type is protected by device_hotplug_lock */ mem->online_type = online_type; ret = device_online(&mem->dev); @@ -429,7 +429,8 @@ static ssize_t valid_zones_show(struct device *dev, }
nid = mem->nid; - default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages); + default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn, + nr_pages); strcat(buf, default_zone->name);
print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL, diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 9d28fca5bbde8..09a05938576a3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -47,9 +47,13 @@ enum {
/* Types for control the zone type of onlined and offlined memory */ enum { + /* Offline the memory. */ MMOP_OFFLINE = -1, - MMOP_ONLINE_KEEP, + /* Online the memory. Zone depends, see default_zone_for_pfn(). */ + MMOP_ONLINE, + /* Online the memory to ZONE_NORMAL. */ MMOP_ONLINE_KERNEL, + /* Online the memory to ZONE_MOVABLE. */ MMOP_ONLINE_MOVABLE, };
From: David Hildenbrand david@redhat.com
mainline inclusion from linux-5.7-rc1 commit efc978ad0e05ed6401c7854811750bf55b67f4b9 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Historically, we used the value -1. Just treat 0 as the special case now. Clarify a comment (which was wrong: when we come via device_online() the first time, the online_type would have been 0 / MEM_ONLINE). The default is now always MMOP_OFFLINE. This removes the last user of the manual "-1", which didn't use the enum value.
This is a preparation to use the online_type as an array index.
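A sketch of the path the comment talks about (memory16 is just an example block): writing to the generic device "online" attribute, rather than "state", leaves online_type at its default, which memory_subsys_online() now maps to MMOP_ONLINE:

  echo 1 > /sys/devices/system/memory/memory16/online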
Signed-off-by: David Hildenbrand david@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Wei Yang richard.weiyang@gmail.com Reviewed-by: Baoquan He bhe@redhat.com Acked-by: Michal Hocko mhocko@suse.com Acked-by: Pankaj Gupta pankaj.gupta.linux@gmail.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Oscar Salvador osalvador@suse.de Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: Wei Yang richard.weiyang@gmail.com Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Eduardo Habkost ehabkost@redhat.com Cc: Haiyang Zhang haiyangz@microsoft.com Cc: Igor Mammedov imammedo@redhat.com Cc: "K. Y. Srinivasan" kys@microsoft.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: Stephen Hemminger sthemmin@microsoft.com Cc: Vitaly Kuznetsov vkuznets@redhat.com Cc: Wei Liu wei.liu@kernel.org Cc: Yumei Huang yuhuang@redhat.com Link: http://lkml.kernel.org/r/20200317104942.11178-3-david@redhat.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/base/memory.c | 11 ++++------- include/linux/memory_hotplug.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 74d9bf9dddaab..5dff2c0d56b91 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -293,17 +293,14 @@ static int memory_subsys_online(struct device *dev) return 0;
/* - * If we are called from state_store(), online_type will be - * set >= 0 Otherwise we were called from the device online - * attribute and need to set the online_type. + * When called via device_online() without configuring the online_type, + * we want to default to MMOP_ONLINE. */ - if (mem->online_type < 0) + if (mem->online_type == MMOP_OFFLINE) mem->online_type = MMOP_ONLINE;
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); - - /* clear online_type */ - mem->online_type = -1; + mem->online_type = MMOP_OFFLINE;
return ret; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 09a05938576a3..dfb5d5b6f1063 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -48,7 +48,7 @@ enum { /* Types for control the zone type of onlined and offlined memory */ enum { /* Offline the memory. */ - MMOP_OFFLINE = -1, + MMOP_OFFLINE = 0, /* Online the memory. Zone depends, see default_zone_for_pfn(). */ MMOP_ONLINE, /* Online the memory to ZONE_NORMAL. */
From: David Hildenbrand david@redhat.com
mainline inclusion from linux-5.7-rc1 commit 4dc8207bfd45799525f882e1039e63e9438d605e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Let's use a simple array which we can reuse soon. While at it, move the string->mmop conversion out of the device hotplug lock.
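Illustrative only (memory16 is just an example block): any string not found in online_type_to_str[] is now rejected with -EINVAL before the device hotplug lock is taken:

  echo online_kernel > /sys/devices/system/memory/memory16/state   # accepted
  echo not_a_type > /sys/devices/system/memory/memory16/state      # rejected with EINVAL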
Signed-off-by: David Hildenbrand david@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Wei Yang richard.weiyang@gmail.com Reviewed-by: Baoquan He bhe@redhat.com Acked-by: Michal Hocko mhocko@suse.com Acked-by: Pankaj Gupta pankaj.gupta.linux@gmail.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Oscar Salvador osalvador@suse.de Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: Wei Yang richard.weiyang@gmail.com Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Eduardo Habkost ehabkost@redhat.com Cc: Haiyang Zhang haiyangz@microsoft.com Cc: Igor Mammedov imammedo@redhat.com Cc: "K. Y. Srinivasan" kys@microsoft.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: Stephen Hemminger sthemmin@microsoft.com Cc: Vitaly Kuznetsov vkuznets@redhat.com Cc: Wei Liu wei.liu@kernel.org Cc: Yumei Huang yuhuang@redhat.com Link: http://lkml.kernel.org/r/20200317104942.11178-4-david@redhat.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/base/memory.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5dff2c0d56b91..1b6f89fea15d4 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -30,6 +30,24 @@ static DEFINE_MUTEX(mem_sysfs_mutex);
#define MEMORY_CLASS_NAME "memory"
+static const char *const online_type_to_str[] = { + [MMOP_OFFLINE] = "offline", + [MMOP_ONLINE] = "online", + [MMOP_ONLINE_KERNEL] = "online_kernel", + [MMOP_ONLINE_MOVABLE] = "online_movable", +}; + +static int memhp_online_type_from_str(const char *str) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) { + if (sysfs_streq(str, online_type_to_str[i])) + return i; + } + return -EINVAL; +} + #define to_memory_block(dev) container_of(dev, struct memory_block, dev)
static int sections_per_block; @@ -322,26 +340,17 @@ static int memory_subsys_offline(struct device *dev) static ssize_t state_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { + const int online_type = memhp_online_type_from_str(buf); struct memory_block *mem = to_memory_block(dev); - int ret, online_type; + int ret; + + if (online_type < 0) + return -EINVAL;
ret = lock_device_hotplug_sysfs(); if (ret) return ret;
- if (sysfs_streq(buf, "online_kernel")) - online_type = MMOP_ONLINE_KERNEL; - else if (sysfs_streq(buf, "online_movable")) - online_type = MMOP_ONLINE_MOVABLE; - else if (sysfs_streq(buf, "online")) - online_type = MMOP_ONLINE; - else if (sysfs_streq(buf, "offline")) - online_type = MMOP_OFFLINE; - else { - ret = -EINVAL; - goto err; - } - switch (online_type) { case MMOP_ONLINE_KERNEL: case MMOP_ONLINE_MOVABLE: @@ -357,7 +366,6 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, ret = -EINVAL; /* should never happen */ }
-err: unlock_device_hotplug();
if (ret < 0)
From: David Hildenbrand david@redhat.com
mainline inclusion from linux-5.7-rc1 commit bc58ebd506c369c26337cf6b1a400af1a25c989c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
We get the MEM_ONLINE notifier call if memory is added right from the kernel via add_memory() or later from user space.
Let's get rid of the "ha_waiting" flag - the wait event has an inbuilt mechanism (->done) for that. Initialize the wait event only once and reinitialize before adding memory. Unconditionally call complete() and wait_for_completion_timeout().
If there are no waiters, complete() will only increment ->done - which will be reset by reinit_completion(). If complete() has already been called, wait_for_completion_timeout() will not wait.
There is still the chance for a small race between concurrent reinit_completion() and complete(). If complete() wins, we would not wait - which is tolerable (and the race exists in current code as well).
Note: We only wait for "some" memory to get onlined, which seems to be good enough for now.
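Not part of this patch, but as a hint: the practical way to avoid hitting the (up to 5s per block) wait is to let the kernel online hot-added memory itself, e.g.:

  echo online > /sys/devices/system/memory/auto_online_blocks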
[akpm@linux-foundation.org: register_memory_notifier() after init_completion(), per David] Signed-off-by: David Hildenbrand david@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Vitaly Kuznetsov vkuznets@redhat.com Reviewed-by: Baoquan He bhe@redhat.com Cc: "K. Y. Srinivasan" kys@microsoft.com Cc: Haiyang Zhang haiyangz@microsoft.com Cc: Stephen Hemminger sthemmin@microsoft.com Cc: Wei Liu wei.liu@kernel.org Cc: Oscar Salvador osalvador@suse.de Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: Wei Yang richard.weiyang@gmail.com Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Eduardo Habkost ehabkost@redhat.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Igor Mammedov imammedo@redhat.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: Yumei Huang yuhuang@redhat.com Link: http://lkml.kernel.org/r/20200317104942.11178-6-david@redhat.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/hv/hv_balloon.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-)
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 6b782f2e93fc9..d1c9a6adf485c 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -541,7 +541,6 @@ struct hv_dynmem_device { * State to synchronize hot-add. */ struct completion ol_waitevent; - bool ha_waiting; /* * This thread handles hot-add * requests from the host as well as notifying @@ -642,10 +641,7 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, switch (val) { case MEM_ONLINE: case MEM_CANCEL_ONLINE: - if (dm_device.ha_waiting) { - dm_device.ha_waiting = false; - complete(&dm_device.ol_waitevent); - } + complete(&dm_device.ol_waitevent); break;
case MEM_OFFLINE: @@ -731,8 +727,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, has->covered_end_pfn += processed_pfn; spin_unlock_irqrestore(&dm_device.ha_lock, flags);
- init_completion(&dm_device.ol_waitevent); - dm_device.ha_waiting = !memhp_auto_online; + reinit_completion(&dm_device.ol_waitevent);
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), @@ -758,15 +753,14 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, }
/* - * Wait for the memory block to be onlined when memory onlining - * is done outside of kernel (memhp_auto_online). Since the hot - * add has succeeded, it is ok to proceed even if the pages in - * the hot added region have not been "onlined" within the - * allowed time. + * Wait for memory to get onlined. If the kernel onlined the + * memory when adding it, this will return directly. Otherwise, + * it will wait for user space to online the memory. This helps + * to avoid adding memory faster than it is getting onlined. As + * adding succeeded, it is ok to proceed even if the memory was + * not onlined in time. */ - if (dm_device.ha_waiting) - wait_for_completion_timeout(&dm_device.ol_waitevent, - 5*HZ); + wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ); post_status(&dm_device); } } @@ -1611,6 +1605,7 @@ static int balloon_probe(struct hv_device *dev,
#ifdef CONFIG_MEMORY_HOTPLUG set_online_page_callback(&hv_online_page); + init_completion(&dm_device.ol_waitevent); register_memory_notifier(&hv_memory_nb); #endif
From: David Hildenbrand david@redhat.com
mainline inclusion from linux-5.7-rc1 commit 862919e568356cc36288a11b42cd88ec3a7100e9 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Convert memhp_auto_online to store an online_type and rename it to memhp_default_online_type. This is a preparation for more detailed default online behavior.
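The user-visible interface is unchanged at this point; illustratively, auto_online_blocks still accepts only the two values, now backed by an online_type:

  cat /sys/devices/system/memory/auto_online_blocks
  echo online > /sys/devices/system/memory/auto_online_blocks    # MMOP_ONLINE
  echo offline > /sys/devices/system/memory/auto_online_blocks   # MMOP_OFFLINE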
Signed-off-by: David Hildenbrand david@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Wei Yang richard.weiyang@gmail.com Reviewed-by: Baoquan He bhe@redhat.com Acked-by: Michal Hocko mhocko@suse.com Acked-by: Pankaj Gupta pankaj.gupta.linux@gmail.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Oscar Salvador osalvador@suse.de Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: Wei Yang richard.weiyang@gmail.com Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Eduardo Habkost ehabkost@redhat.com Cc: Haiyang Zhang haiyangz@microsoft.com Cc: Igor Mammedov imammedo@redhat.com Cc: "K. Y. Srinivasan" kys@microsoft.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: Stephen Hemminger sthemmin@microsoft.com Cc: Vitaly Kuznetsov vkuznets@redhat.com Cc: Wei Liu wei.liu@kernel.org Cc: Yumei Huang yuhuang@redhat.com Link: http://lkml.kernel.org/r/20200317104942.11178-8-david@redhat.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org [Wupeng: keep memhp_auto_online for kabi] Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/base/memory.c | 10 ++++------ include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c | 9 ++++++--- 3 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 1b6f89fea15d4..f7c41be57976b 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -473,10 +473,8 @@ static DEVICE_ATTR_RO(block_size_bytes); static ssize_t auto_online_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (memhp_auto_online) - return sprintf(buf, "online\n"); - else - return sprintf(buf, "offline\n"); + return sprintf(buf, "%s\n", + online_type_to_str[memhp_default_online_type]); }
static ssize_t auto_online_blocks_store(struct device *dev, @@ -484,9 +482,9 @@ static ssize_t auto_online_blocks_store(struct device *dev, const char *buf, size_t count) { if (sysfs_streq(buf, "online")) - memhp_auto_online = true; + memhp_default_online_type = MMOP_ONLINE; else if (sysfs_streq(buf, "offline")) - memhp_auto_online = false; + memhp_default_online_type = MMOP_OFFLINE; else return -EINVAL;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index dfb5d5b6f1063..1ba6a688094de 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -105,6 +105,8 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid);
extern bool memhp_auto_online; +/* Default online_type (MMOP_*) when new memory blocks are added. */ +extern int memhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f13144a65b127..3b9d409c902b9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -68,8 +68,10 @@ void put_online_mems(void) bool movable_node_enabled = false;
#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE +int memhp_default_online_type = MMOP_OFFLINE; bool memhp_auto_online; #else +int memhp_default_online_type = MMOP_ONLINE; bool memhp_auto_online = true; #endif EXPORT_SYMBOL_GPL(memhp_auto_online); @@ -77,9 +79,9 @@ EXPORT_SYMBOL_GPL(memhp_auto_online); static int __init setup_memhp_default_state(char *str) { if (!strcmp(str, "online")) - memhp_auto_online = true; + memhp_default_online_type = MMOP_ONLINE; else if (!strcmp(str, "offline")) - memhp_auto_online = false; + memhp_default_online_type = MMOP_OFFLINE;
return 1; } @@ -1042,6 +1044,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg) { + mem->online_type = memhp_default_online_type; return device_online(&mem->dev); }
@@ -1114,7 +1117,7 @@ int __ref add_memory_resource(int nid, struct resource *res) mem_hotplug_done();
/* online pages if requested */ - if (memhp_auto_online) + if (memhp_default_online_type != MMOP_OFFLINE) walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, online_memory_block);
From: David Hildenbrand david@redhat.com
mainline inclusion from linux-5.7-rc1 commit 5f47adf762b78cae97de58d9ff01d2d44db09467 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
For now, distributions implement advanced udev rules to essentially
- Don't online any hotplugged memory (s390x)
- Online all memory to ZONE_NORMAL (e.g., most virt environments like hyperv)
- Online all memory to ZONE_MOVABLE in case the zone imbalance is taken care of (e.g., bare metal, special virt environments)
In summary: all memory is usually onlined the same way, yet the kernel always has to ask user space to come up with the same answer. E.g., Hyper-V always waits for a memory block to get onlined before continuing, otherwise it might end up adding memory faster than onlining it, which can result in strange OOM situations. This waiting slows down adding larger amounts of memory.
Let's allow specifying a default online_type, not just "online" and "offline". This allows distributions to configure the default online_type when booting up and be done with it.
We can now specify "offline", "online", "online_movable" and "online_kernel" via
- "memhp_default_state=" on the kernel cmdline
- /sys/devices/system/memory/auto_online_blocks
just like we are able to specify for a single memory block via /sys/devices/system/memory/memoryX/state
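For example, to have every future hotplugged block onlined to ZONE_MOVABLE by default:

  echo online_movable > /sys/devices/system/memory/auto_online_blocks
  cat /sys/devices/system/memory/auto_online_blocks

or, equivalently, boot with memhp_default_state=online_movable on the kernel cmdline.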
Signed-off-by: David Hildenbrand david@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Wei Yang richard.weiyang@gmail.com Reviewed-by: Baoquan He bhe@redhat.com Acked-by: Michal Hocko mhocko@suse.com Acked-by: Pankaj Gupta pankaj.gupta.linux@gmail.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Oscar Salvador osalvador@suse.de Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: Wei Yang richard.weiyang@gmail.com Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Eduardo Habkost ehabkost@redhat.com Cc: Haiyang Zhang haiyangz@microsoft.com Cc: Igor Mammedov imammedo@redhat.com Cc: "K. Y. Srinivasan" kys@microsoft.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: Stephen Hemminger sthemmin@microsoft.com Cc: Vitaly Kuznetsov vkuznets@redhat.com Cc: Wei Liu wei.liu@kernel.org Cc: Yumei Huang yuhuang@redhat.com Link: http://lkml.kernel.org/r/20200317104942.11178-9-david@redhat.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/base/memory.c | 11 +++++------ include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c | 8 ++++---- 3 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f7c41be57976b..1b97f305173fb 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -37,7 +37,7 @@ static const char *const online_type_to_str[] = { [MMOP_ONLINE_MOVABLE] = "online_movable", };
-static int memhp_online_type_from_str(const char *str) +int memhp_online_type_from_str(const char *str) { int i;
@@ -481,13 +481,12 @@ static ssize_t auto_online_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - if (sysfs_streq(buf, "online")) - memhp_default_online_type = MMOP_ONLINE; - else if (sysfs_streq(buf, "offline")) - memhp_default_online_type = MMOP_OFFLINE; - else + const int online_type = memhp_online_type_from_str(buf); + + if (online_type < 0) return -EINVAL;
+ memhp_default_online_type = online_type; return count; }
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 1ba6a688094de..bc433d459c861 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -105,6 +105,8 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid);
extern bool memhp_auto_online; +extern int memhp_online_type_from_str(const char *str); + /* Default online_type (MMOP_*) when new memory blocks are added. */ extern int memhp_default_online_type; /* If movable_node boot option specified */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3b9d409c902b9..3df9285b73b18 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -78,10 +78,10 @@ EXPORT_SYMBOL_GPL(memhp_auto_online);
static int __init setup_memhp_default_state(char *str) { - if (!strcmp(str, "online")) - memhp_default_online_type = MMOP_ONLINE; - else if (!strcmp(str, "offline")) - memhp_default_online_type = MMOP_OFFLINE; + const int online_type = memhp_online_type_from_str(str); + + if (online_type >= 0) + memhp_default_online_type = online_type;
return 1; }
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Make efi_print_memmap() public in preparation for adding fake memory support for architectures with EFI support, e.g., arm64.
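Once a caller invokes efi_print_memmap(), every descriptor ends up in the kernel log in the format used by the pr_info() above; as a rough illustration it can then be inspected with:

  dmesg | grep 'range=\['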
Co-developed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/include/asm/efi.h | 1 - arch/x86/platform/efi/efi.c | 16 ---------------- drivers/firmware/efi/memmap.c | 16 ++++++++++++++++ include/linux/efi.h | 1 + 4 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index baa549f8e9188..00aa9de5abbb1 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -123,7 +123,6 @@ extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); extern int __init efi_memblock_x86_reserve_range(void); extern pgd_t * __init efi_call_phys_prolog(void); extern void __init efi_call_phys_epilog(pgd_t *save_pgd); -extern void __init efi_print_memmap(void); extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 9e5229139761b..a01663c9f740e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -278,22 +278,6 @@ static void __init efi_clean_memmap(void) } }
-void __init efi_print_memmap(void) -{ - efi_memory_desc_t *md; - int i = 0; - - for_each_efi_memory_desc(md) { - char buf[64]; - - pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", - i++, efi_md_typeattr_format(buf, sizeof(buf), md), - md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, - (md->num_pages >> (20 - EFI_PAGE_SHIFT))); - } -} - static int __init efi_systab_init(void *phys) { if (efi_enabled(EFI_64BIT)) { diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c index 1907db2b38d81..16c160569e1a1 100644 --- a/drivers/firmware/efi/memmap.c +++ b/drivers/firmware/efi/memmap.c @@ -29,6 +29,22 @@ static phys_addr_t __init __efi_memmap_alloc_late(unsigned long size) return PFN_PHYS(page_to_pfn(p)); }
+void __init efi_print_memmap(void) +{ + efi_memory_desc_t *md; + int i = 0; + + for_each_efi_memory_desc(md) { + char buf[64]; + + pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", + i++, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, + (md->num_pages >> (20 - EFI_PAGE_SHIFT))); + } +} + /** * efi_memmap_alloc - Allocate memory for the EFI memory map * @num_entries: Number of entries in the allocated map. diff --git a/include/linux/efi.h b/include/linux/efi.h index 34c255c2a487c..1582fa572651e 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1039,6 +1039,7 @@ extern int __init efi_memmap_split_count(efi_memory_desc_t *md, struct range *range); extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, struct efi_mem_range *mem); +extern void __init efi_print_memmap(void);
extern int efi_config_init(efi_config_table_type_t *arch_tables); #ifdef CONFIG_EFI_ESRT
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
The fake memory map is used to fake memory attribute values. Commit 0f96a99dab36 ("efi: Add "efi_fake_mem" boot option") introduced the efi_fake_mem function. With this patch, it is supported on arm64 as well. For example, you can mark the 0-6G range as EFI_MEMORY_MORE_RELIABLE by adding efi_fake_mem=6G@0:0x10000 to the boot arguments. More information about the fake memmap can be found in kernel-parameters.txt.
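One possible way to persist the option and verify it after reboot (assuming grubby is available; the range is just the example from above):

  grubby --update-kernel=ALL --args="efi_fake_mem=6G@0:0x10000"
  # after reboot:
  grep -o 'efi_fake_mem=[^ ]*' /proc/cmdline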
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/arm64/mm/init.c | 3 +++ drivers/firmware/efi/Kconfig | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5623a9d1cf813..58eb696ccf13e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1159,7 +1159,7 @@ you are really sure that your UEFI does sane gc and fulfills the spec otherwise your board may brick.
- efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86] + efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86; ARM64] Add arbitrary attribute to specific memory range by updating original EFI memory map. Region of memory which aa attribute is added to is diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 8cdf92626c2c6..8eb44aec2c78e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -724,6 +724,9 @@ void __init arm64_memblock_init(void) else arm64_dma_phys_limit = PHYS_MASK + 1;
+ if (efi_enabled(EFI_MEMMAP)) + efi_fake_memmap(); + reserve_pin_memory_res();
/* diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 89110dfc7127c..65ae56c6c040f 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -54,7 +54,7 @@ config EFI_RUNTIME_MAP
config EFI_FAKE_MEMMAP bool "Enable EFI fake memory map" - depends on EFI && X86 + depends on EFI && (X86 || ARM64) default n help Saying Y here will enable "efi_fake_mem" boot option.
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Commit b05b9f5f9dcf ("x86, mirror: x86 enabling - find mirrored memory ranges") introduced the efi_find_mirror() function on x86. In order to reuse the API, make it public in preparation for arm64 support of mirrored memory.
Co-developed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/platform/efi/efi.c | 20 -------------------- drivers/firmware/efi/efi.c | 20 ++++++++++++++++++++ include/linux/efi.h | 4 +++- 3 files changed, 23 insertions(+), 21 deletions(-)
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a01663c9f740e..b8cb57edd424a 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -101,26 +101,6 @@ static efi_status_t __init phys_efi_set_virtual_address_map( return status; }
-void __init efi_find_mirror(void) -{ - efi_memory_desc_t *md; - u64 mirror_size = 0, total_size = 0; - - for_each_efi_memory_desc(md) { - unsigned long long start = md->phys_addr; - unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; - - total_size += size; - if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { - memblock_mark_mirror(start, size); - mirror_size += size; - } - } - if (mirror_size) - pr_info("Memory: %lldM/%lldM mirrored memory\n", - mirror_size>>20, total_size>>20); -} - /* * Tell the kernel about the EFI memory map. This might include * more than the max 128 entries that can fit in the e820 legacy diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 0f3ce03c1116d..a159ae07d66f8 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -404,6 +404,26 @@ static int __init efisubsys_init(void)
subsys_initcall(efisubsys_init);
+void __init efi_find_mirror(void) +{ + efi_memory_desc_t *md; + u64 mirror_size = 0, total_size = 0; + + for_each_efi_memory_desc(md) { + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + + total_size += size; + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { + memblock_mark_mirror(start, size); + mirror_size += size; + } + } + if (mirror_size) + pr_info("Memory: %lldM/%lldM mirrored memory\n", + mirror_size>>20, total_size>>20); +} + /* * Find the efi memory descriptor for a given physical address. Given a * physical address, determine if it exists within an EFI Memory Map entry, diff --git a/include/linux/efi.h b/include/linux/efi.h index 1582fa572651e..b90423d1128bc 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1017,7 +1017,6 @@ extern void efi_free_boot_services(void); extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, bool nonblocking); -extern void efi_find_mirror(void); #else static inline void efi_free_boot_services(void) {}
@@ -1181,6 +1180,7 @@ static inline bool efi_enabled(int feature) extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused);
extern bool efi_is_table_address(unsigned long phys_addr); +extern void efi_find_mirror(void); #else static inline bool efi_enabled(int feature) { @@ -1199,6 +1199,8 @@ static inline bool efi_is_table_address(unsigned long phys_addr) { return false; } + +static inline void efi_find_mirror(void) {} #endif
extern int efi_status_to_err(efi_status_t status);
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Mirrored memory can be used on HiSilicon arm64 SoCs, so call efi_find_mirror() during arm64 memblock initialization so that the system can mark any mirrored ranges in memblock.
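As a rough check: when mirrored ranges are present, efi_find_mirror() logs the "... mirrored memory" summary shown in the previous patch, and kernelcore=mirror is what actually pushes non-mirrored memory into ZONE_MOVABLE:

  dmesg | grep 'mirrored memory'
  grep -o 'kernelcore=[^ ]*' /proc/cmdline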
Co-developed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/arm64/mm/init.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 58eb696ccf13e..886c900323f14 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1944,7 +1944,7 @@
keepinitrd [HW,ARM]
- kernelcore= [KNL,X86,IA-64,PPC] + kernelcore= [KNL,X86,IA-64,PPC,ARM64] Format: nn[KMGTPE] | nn% | "mirror" This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 8eb44aec2c78e..a3b3bfccd8221 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -724,8 +724,10 @@ void __init arm64_memblock_init(void) else arm64_dma_phys_limit = PHYS_MASK + 1;
- if (efi_enabled(EFI_MEMMAP)) + if (efi_enabled(EFI_MEMMAP)) { efi_fake_memmap(); + efi_find_mirror(); + }
reserve_pin_memory_res();
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Introduction
============
The memory reliable feature is a memory tiering mechanism. It is based on the kernel mirror feature, which splits memory into two separate regions: the mirrored (reliable) region and the non-mirrored (non-reliable) region.
For the kernel mirror feature:

- allocate kernel memory from the mirrored region by default
- allocate user memory from the non-mirrored region by default

The non-mirrored region will be arranged into ZONE_MOVABLE.
On top of that, the memory reliable feature has the additional features below:
- normal user tasks never allocate memory from the mirrored region via userspace APIs (malloc, mmap, etc.)
- special user tasks allocate memory from the mirrored region by default
- tmpfs/pagecache allocate memory from the mirrored region by default
- an upper limit applies to the mirrored memory allocated for user tasks, tmpfs and pagecache
A reliable fallback mechanism is supported, which allows special user tasks, tmpfs and pagecache to fall back to allocating from the non-mirrored region; this is the default setting.
In order to fulfil these goals:

- a ___GFP_RELIABILITY flag is added for allocating memory from the mirrored region.

- the high_zoneidx for special user tasks/tmpfs/pagecache is set to ZONE_NORMAL.

- normal user tasks can only allocate from ZONE_MOVABLE.
This patch only provides the main framework; memory reliable support for special user tasks, pagecache and tmpfs comes in its own patches.
To enable this feature, mirrored (reliable) memory is needed and "kernelcore=reliable" should be added to the kernel parameters.
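A minimal sketch of enabling it (the efi_fake_mem range is only an example for machines without real mirrored memory):

  # kernel cmdline:
  #   efi_fake_mem=6G@0:0x10000 kernelcore=reliable
  # after boot, check the messages printed with the "mem reliable: " pr_fmt() prefix:
  dmesg | grep 'mem reliable'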
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- .../admin-guide/kernel-parameters.txt | 5 +- include/linux/gfp.h | 12 +++ include/linux/mem_reliable.h | 64 +++++++++++++++ include/linux/mm.h | 3 + mm/Kconfig | 18 +++++ mm/Makefile | 1 + mm/mem_reliable.c | 78 +++++++++++++++++++ mm/page_alloc.c | 46 ++++++++++- 8 files changed, 224 insertions(+), 3 deletions(-) create mode 100644 include/linux/mem_reliable.h create mode 100644 mm/mem_reliable.c
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 886c900323f14..cc5eec8959a07 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1945,7 +1945,7 @@ keepinitrd [HW,ARM]
kernelcore= [KNL,X86,IA-64,PPC,ARM64] - Format: nn[KMGTPE] | nn% | "mirror" + Format: nn[KMGTPE] | nn% | "mirror" | "reliable" This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested amount is spread evenly throughout all nodes in the @@ -1969,6 +1969,9 @@ for Movable pages. "nn[KMGTPE]", "nn%", and "mirror" are exclusive, so you cannot specify multiple forms.
+ Option "reliable" is base on option "mirror", but make + some extension. These two features are alternatives. + kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. Format: <Controller#>[,poll interval] The controller # is the number of the ehci usb debug diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f78d1e89593fd..152cb9bdf4365 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -45,6 +45,12 @@ struct vm_area_struct; #define ___GFP_NOLOCKDEP 0 #endif /* If the above are modified, __GFP_BITS_SHIFT may need updating */ +#ifdef CONFIG_MEMORY_RELIABLE +/* add flag at the end of gfp_mask to aovid kapi change */ +#define ___GFP_RELIABILITY 0x40000000u +#else +#define ___GFP_RELIABILITY 0 +#endif
/* * Physical address zone modifiers (see linux/mmzone.h - low four bits) @@ -446,6 +452,12 @@ static inline enum zone_type gfp_zone(gfp_t flags) z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + +#ifdef CONFIG_MEMORY_RELIABLE + if (z == ZONE_MOVABLE && flags & ___GFP_RELIABILITY) + return ZONE_NORMAL; +#endif + return z; }
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h new file mode 100644 index 0000000000000..b03108441e37a --- /dev/null +++ b/include/linux/mem_reliable.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MM_MEM_RELIABLE__ +#define __MM_MEM_RELIABLE__ + +#include <linux/stddef.h> +#include <linux/gfp.h> +#include <linux/mmzone.h> +#include <linux/mm_types.h> +#include <linux/sched.h> + + +#ifdef CONFIG_MEMORY_RELIABLE + +extern struct static_key_false mem_reliable; + +extern bool reliable_enabled; + +extern void add_reliable_mem_size(long sz); +extern void mem_reliable_init(bool has_unmirrored_mem, + unsigned long *zone_movable_pfn); + +static inline bool mem_reliable_is_enabled(void) +{ + return static_branch_likely(&mem_reliable); +} + +static inline bool zone_reliable(struct zone *zone) +{ + return mem_reliable_is_enabled() && zone_idx(zone) < ZONE_MOVABLE; +} + +static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) +{ + if (!mem_reliable_is_enabled()) + return false; + + if (!current->mm || (current->flags & PF_KTHREAD)) + return false; + + /* user tasks can only alloc memory from non-mirrored region */ + if (!(gfp & ___GFP_RELIABILITY) && (gfp & __GFP_HIGHMEM) && + (gfp & __GFP_MOVABLE)) { + if (zonelist_zone_idx(z) < ZONE_MOVABLE) + return true; + } + + return false; +} +#else +#define reliable_enabled 0 + +static inline bool mem_reliable_is_enabled(void) { return false; } +static inline void add_reliable_mem_size(long sz) {} +static inline void mem_reliable_init(bool has_unmirrored_mem, + unsigned long *zone_movable_pfn) {} +static inline bool zone_reliable(struct zone *zone) { return false; } +static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) +{ + return false; +} + +#endif + +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index be0be448c3f19..630b103065f4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -28,6 +28,9 @@ #include <linux/memremap.h> #include <linux/overflow.h>
+/* added to mm.h to avoid every caller adding new header file */ +#include <linux/mem_reliable.h> + struct mempolicy; struct anon_vma; struct anon_vma_chain; diff --git a/mm/Kconfig b/mm/Kconfig index 12601505c4a4a..80d7b47ca9f53 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -831,4 +831,22 @@ config PID_RESERVE We record the pid of dump task in the reserve memory, and reserve the pids before init task start. In restore process, free the reserved pids and realloc them for use. + +config MEMORY_RELIABLE + bool "Support for memory reliable" + depends on ARM64 + default n + help + Memory reliable is based on mirror memory. It has the following + additional features: + a) normal user tasks never alloc memory from mirrored region; + b) special user tasks will allocate memory from mirrored region + by default; c) upper limit of mirrored region allcated for user + tasks, tmpfs and pagecache. + Special user tasks and tmpfs/pagecache can fallback to + non-mirrored region if you enable reliable fallback mechanism. + + To enable this function, mirrored memory is needed and + "kernelcore=reliable" need to be added in kernel parameters. + endmenu diff --git a/mm/Makefile b/mm/Makefile index 8fba091be3868..741f9c250914c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -109,3 +109,4 @@ obj-$(CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE) += hugepage_tuning.o obj-$(CONFIG_PIN_MEMORY) += pin_mem.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o +obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c new file mode 100644 index 0000000000000..2e21839ca49fb --- /dev/null +++ b/mm/mem_reliable.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "mem reliable: " fmt + + +#include <linux/mm.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> + +DEFINE_STATIC_KEY_FALSE(mem_reliable); + +bool reliable_enabled; + +static atomic_long_t total_reliable_mem; + +void add_reliable_mem_size(long sz) +{ + atomic_long_add(sz, &total_reliable_mem); +} + +static int reliable_mem_notifier(struct notifier_block *nb, + unsigned long action, void *arg) +{ + struct memory_notify *m_arg = arg; + struct zone *zone; + + switch (action) { + case MEM_ONLINE: + zone = page_zone(pfn_to_page(m_arg->start_pfn)); + if (zone_reliable(zone)) + add_reliable_mem_size(m_arg->nr_pages * PAGE_SIZE); + break; + case MEM_OFFLINE: + zone = page_zone(pfn_to_page(m_arg->start_pfn)); + if (zone_reliable(zone)) + add_reliable_mem_size(-m_arg->nr_pages * PAGE_SIZE); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block reliable_notifier_block = { + .notifier_call = reliable_mem_notifier, +}; + +void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) +{ + if (!reliable_enabled) + return; + + if (atomic_long_read(&total_reliable_mem) == 0) { + memset(zone_movable_pfn, 0, + sizeof(unsigned long) * MAX_NUMNODES); + + pr_err("init failed, mirrored memory size is zero."); + + return; + } + + if (!has_unmirrored_mem) { + pr_err("init failed, unmirrored memory size is zero."); + + return; + } + + if (register_hotmemory_notifier(&reliable_notifier_block)) { + pr_err("init failed, register memory notifier failed."); + return; + } + + static_branch_enable(&mem_reliable); + + pr_info("init succeed, mirrored memory size(%lu)", + atomic_long_read(&total_reliable_mem)); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4cad86f1e3a91..e1e513e851dec 100644 --- a/mm/page_alloc.c +++ 
b/mm/page_alloc.c @@ -3454,6 +3454,10 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct page *page; unsigned long mark;
+ /* skip non-movable zone for normal user tasks */ + if (skip_none_movable_zone(gfp_mask, z)) + continue; + /* * CDM nodes get skipped if the requested gfp flag * does not have __GFP_THISNODE set or the nodemask @@ -4557,6 +4561,18 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) ac->high_zoneidx, ac->nodemask); }
+static inline void prepare_before_alloc(gfp_t *gfp_mask) +{ + gfp_t gfp_ori = *gfp_mask; + *gfp_mask &= gfp_allowed_mask; + + if (!mem_reliable_is_enabled()) + return; + + if (gfp_ori & ___GFP_RELIABILITY) + *gfp_mask |= ___GFP_RELIABILITY; +} + /* * This is the 'heart' of the zoned buddy allocator. */ @@ -4578,7 +4594,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, return NULL; }
- gfp_mask &= gfp_allowed_mask; + prepare_before_alloc(&gfp_mask); + alloc_mask = gfp_mask; if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; @@ -6912,10 +6929,13 @@ static void __init find_zone_movable_pfns_for_nodes(void) */ if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; + bool has_unmirrored_mem = false;
for_each_memblock(memory, r) { - if (memblock_is_mirror(r)) + if (memblock_is_mirror(r)) { + add_reliable_mem_size(r->size); continue; + }
nid = r->nid;
@@ -6926,6 +6946,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) continue; }
+ has_unmirrored_mem = true; zone_movable_pfn[nid] = zone_movable_pfn[nid] ? min(usable_startpfn, zone_movable_pfn[nid]) : usable_startpfn; @@ -6934,6 +6955,8 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mem_below_4gb_not_mirrored) pr_warn("This configuration results in unmirrored kernel memory.");
+ mem_reliable_init(has_unmirrored_mem, zone_movable_pfn); + goto out2; }
@@ -7226,9 +7249,28 @@ static int __init cmdline_parse_kernelcore(char *p) { /* parse kernelcore=mirror */ if (parse_option_str(p, "mirror")) { + if (reliable_enabled) { + pr_info("kernelcore=reliable and kernelcore=mirror are alternative."); + return -EINVAL; + } + + mirrored_kernelcore = true; + return 0; + } + +#ifdef CONFIG_MEMORY_RELIABLE + /* parse kernelcore=reliable */ + if (parse_option_str(p, "reliable")) { + if (!reliable_enabled && mirrored_kernelcore) { + pr_info("kernelcore=mirror and kernelcore=reliable are alternative."); + return -EINVAL; + } + + reliable_enabled = true; mirrored_kernelcore = true; return 0; } +#endif
return cmdline_parse_core(p, &required_kernelcore, &required_kernelcore_percent);
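As a rough sketch of the allocation routing introduced above (illustrative only; the helper and call site below are hypothetical and not part of the patch), a kernel-internal user that wants mirrored memory tags the request with ___GFP_RELIABILITY:

    #include <linux/gfp.h>

    /* hypothetical helper, for illustration only */
    static struct page *alloc_reliable_user_page(void)
    {
            /*
             * A highmem+movable (user-style) allocation explicitly tagged
             * with ___GFP_RELIABILITY: gfp_zone() maps the request from
             * ZONE_MOVABLE back to ZONE_NORMAL, so the page comes from the
             * mirrored region.  Without the flag, skip_none_movable_zone()
             * makes normal user tasks skip the non-movable zones instead.
             */
            return alloc_pages(GFP_HIGHUSER_MOVABLE | ___GFP_RELIABILITY, 0);
    }

Plain user allocations (no flag) keep going to ZONE_MOVABLE, i.e. the non-mirrored region, which matches point a) in the Kconfig help text above.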
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Add ReliableTotal & ReliableUsed to /proc/meminfo to show information about reliable memory.
- ReliableTotal: total reliable RAM
- ReliableUsed: the used amount of reliable memory
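A minimal user-space sketch (illustrative only, not part of this patch) that reads the two new counters back out of /proc/meminfo:

    #include <stdio.h>

    int main(void)
    {
            char line[256];
            unsigned long total_kb = 0, used_kb = 0;
            FILE *fp = fopen("/proc/meminfo", "r");

            if (!fp)
                    return 1;
            while (fgets(line, sizeof(line), fp)) {
                    sscanf(line, "ReliableTotal: %lu kB", &total_kb);
                    sscanf(line, "ReliableUsed: %lu kB", &used_kb);
            }
            fclose(fp);
            printf("reliable memory: %lu kB used of %lu kB\n", used_kb, total_kb);
            return 0;
    }

The fields are only printed when memory reliable is enabled (see reliable_report_meminfo() below), so both values stay 0 here if the feature is off.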
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/filesystems/proc.txt | 4 ++++ fs/proc/meminfo.c | 2 ++ include/linux/mem_reliable.h | 2 ++ mm/mem_reliable.c | 29 +++++++++++++++++++++++++++++ 4 files changed, 37 insertions(+)
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0d0ecc7df2600..690db5b3eb53b 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -880,6 +880,8 @@ HardwareCorrupted: 0 kB AnonHugePages: 49152 kB ShmemHugePages: 0 kB ShmemPmdMapped: 0 kB +ReliableTotal: 7340032 kB +ReliableUsed: 418824 kB
MemTotal: Total usable ram (i.e. physical ram minus a few reserved @@ -970,6 +972,8 @@ VmallocTotal: total size of vmalloc memory area VmallocChunk: largest contiguous block of vmalloc area which is free Percpu: Memory allocated to the percpu allocator used to back percpu allocations. This stat excludes the cost of metadata. +ReliableTotal: Total reliable memory size +ReliableUsed: The used amount of reliable memory
..............................................................................
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index edda898714eb7..883c5f53c303f 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -148,6 +148,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
arch_report_meminfo(m);
+ reliable_report_meminfo(m); + return 0; }
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index b03108441e37a..31be68fac330b 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -18,6 +18,7 @@ extern bool reliable_enabled; extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn); +extern void reliable_report_meminfo(struct seq_file *m);
static inline bool mem_reliable_is_enabled(void) { @@ -58,6 +59,7 @@ static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) { return false; } +static inline void reliable_report_meminfo(struct seq_file *m) {}
#endif
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 2e21839ca49fb..c03c77090cf5b 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -6,6 +6,8 @@ #include <linux/mm.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/seq_file.h> +#include <linux/mmzone.h>
DEFINE_STATIC_KEY_FALSE(mem_reliable);
@@ -76,3 +78,30 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) pr_info("init succeed, mirrored memory size(%lu)", atomic_long_read(&total_reliable_mem)); } + +static unsigned long total_reliable_mem_sz(void) +{ + return atomic_long_read(&total_reliable_mem); +} + +static unsigned long used_reliable_mem_sz(void) +{ + unsigned long nr_page = 0; + struct zone *z; + + for_each_populated_zone(z) + if (zone_idx(z) < ZONE_MOVABLE) + nr_page += zone_page_state(z, NR_FREE_PAGES); + + return total_reliable_mem_sz() - nr_page * PAGE_SIZE; +} + +void reliable_report_meminfo(struct seq_file *m) +{ + if (mem_reliable_is_enabled()) { + seq_printf(m, "ReliableTotal: %8lu kB\n", + total_reliable_mem_sz() >> 10); + seq_printf(m, "ReliableUsed: %8lu kB\n", + used_reliable_mem_sz() >> 10); + } +}
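Worked through with the sample values from the documentation hunk above: used_reliable_mem_sz() is the total mirrored memory minus the free pages still sitting in the zones below ZONE_MOVABLE, so ReliableTotal 7340032 kB and ReliableUsed 418824 kB mean roughly 6921208 kB of mirrored memory was still free when that sample was taken.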
From: Peng Wu wupeng58@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
------------------------------------------
Add a reliable flag for user tasks. A user task with the reliable flag set can only allocate memory from the mirrored region. PF_RELIABLE is added to represent the task's reliable flag.
- The init task is regarded as a special task that allocates memory from the mirrored region.
- For normal user tasks, the reliable flag can be set via the procfs interface shown below and is inherited via fork().
A user can change a task's reliable flag by
$ echo [0/1] > /proc/<pid>/reliable
and check a user task's reliable flag by
$ cat /proc/<pid>/reliable
Note that the global init task's reliable file cannot be accessed.
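A small user-space helper matching the interface above (a sketch only; the helper name is made up here):

    #include <stdio.h>

    /* write 0 or 1 to /proc/<pid>/reliable; returns 0 on success */
    static int set_task_reliable(int pid, int on)
    {
            char path[64];
            FILE *fp;
            int ret = 0;

            snprintf(path, sizeof(path), "/proc/%d/reliable", pid);
            fp = fopen(path, "w");
            if (!fp)
                    return -1;  /* no such pid, or kernel built without the file */
            if (fprintf(fp, "%d\n", on ? 1 : 0) < 0)
                    ret = -1;
            /* the kernel rejects kthreads, exiting tasks and global init (-EPERM) */
            if (fclose(fp) != 0)
                    ret = -1;
            return ret;
    }

Values other than 0 and 1 are rejected with -EINVAL by the write handler added below.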
Signed-off-by: Peng Wu wupeng58@huawei.com Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/proc/base.c | 96 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 1 + mm/page_alloc.c | 3 ++ 3 files changed, 100 insertions(+)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 349c01c68e576..d7e94f7b5ad3e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1222,6 +1222,96 @@ static const struct file_operations proc_oom_score_adj_operations = { .llseek = default_llseek, };
+#ifdef CONFIG_MEMORY_RELIABLE +static inline int reliable_check(struct task_struct *task, struct pid *pid) +{ + if (!mem_reliable_is_enabled()) + return -EPERM; + + if (is_global_init(task)) + return -EPERM; + + if (!task->mm || (task->flags & PF_KTHREAD) || + (task->flags & PF_EXITING)) + return -EPERM; + + return 0; +} + +static ssize_t reliable_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct pid *pid = proc_pid(file_inode(file)); + char buffer[PROC_NUMBUF]; + size_t len; + short val; + int err; + + if (!task) + return -ESRCH; + + err = reliable_check(task, pid); + if (err) { + put_task_struct(task); + return err; + } + + val = task->flags & PF_RELIABLE ? 1 : 0; + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%hd\n", val); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t reliable_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct pid *pid = proc_pid(file_inode(file)); + char buffer[PROC_NUMBUF]; + int val; + int err; + + if (!task) + return -ESRCH; + + err = reliable_check(task, pid); + if (err) + goto out; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &val); + if (err) + goto out; + if (val != 0 && val != 1) { + err = -EINVAL; + goto out; + } + + if (val == 1) + task->flags |= PF_RELIABLE; + else + task->flags &= ~PF_RELIABLE; + +out: + put_task_struct(task); + return err < 0 ? err : count; +} + +static const struct file_operations proc_reliable_operations = { + .read = reliable_read, + .write = reliable_write, + .llseek = generic_file_llseek, +}; +#endif + #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, @@ -3029,6 +3119,9 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_MEMORY_RELIABLE + REG("reliable", S_IRUGO|S_IWUSR, proc_reliable_operations), +#endif #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), @@ -3419,6 +3512,9 @@ static const struct pid_entry tid_base_stuff[] = { ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_MEMORY_RELIABLE + REG("reliable", S_IRUGO|S_IWUSR, proc_reliable_operations), +#endif #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), diff --git a/include/linux/sched.h b/include/linux/sched.h index 945a57ecd9a51..677cb6ace36f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1421,6 +1421,7 @@ extern struct pid *cad_pid; */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ +#define PF_RELIABLE 0x00000008 /* Allocate from reliable memory */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but 
didn't exec */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e1e513e851dec..95d2450cf1771 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4571,6 +4571,9 @@ static inline void prepare_before_alloc(gfp_t *gfp_mask)
if (gfp_ori & ___GFP_RELIABILITY) *gfp_mask |= ___GFP_RELIABILITY; + + if (current->flags & PF_RELIABLE || is_global_init(current)) + *gfp_mask |= ___GFP_RELIABILITY; }
/*
From: Peng Wu wupeng58@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
-------------------------------------------------
Add a variable to mm_struct for accounting the amount of reliable memory allocated by reliable user tasks.
Use KABI_RESERVE(3) in mm_struct to avoid any KABI changes.
Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/mm_types.h | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ae4237a59d21c..f2142a5eab6ef 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -529,7 +529,12 @@ struct mm_struct { KABI_RESERVE(2) #endif
+#if IS_ENABLED(CONFIG_MEMORY_RELIABLE) && !defined(__GENKSYMS__) + atomic_long_t reliable_nr_page; /* total used reliable pages */ +#else KABI_RESERVE(3) +#endif + KABI_RESERVE(4) KABI_RESERVE(5) KABI_RESERVE(6)
From: Peng Wu wupeng58@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
----------------------------------------------
Count the reliable memory allocated by reliable user tasks.
The policy for counting reliable memory usage is based on RSS statistics: anywhere the mm counters are updated, reliable pages need to be counted too. A page identified as reliable by page_reliable() updates the reliable page counter via reliable_page_counter().
Updating the reliable page counter should be considered wherever any of the following logic is added (see the condensed sketch after this list):
- add_mm_counter
- dec_mm_counter
- inc_mm_counter_fast
- dec_mm_counter_fast
- rss[mm_counter(page)]
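A condensed sketch of the pattern the diff below applies at each of those call sites (not a new call site, just the shape of the change):

    /* an anonymous page is mapped into @mm: both counters move together */
    inc_mm_counter(mm, MM_ANONPAGES);
    reliable_page_counter(page, mm, 1);

    /* ... and symmetrically when the mapping is torn down */
    dec_mm_counter(mm, MM_ANONPAGES);
    reliable_page_counter(page, mm, -1);

reliable_page_counter() itself only accounts pages that page_reliable() places below ZONE_MOVABLE, so non-mirrored pages leave the counters untouched.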
Signed-off-by: Peng Wu wupeng58@huawei.com Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/filesystems/proc.txt | 2 ++ fs/proc/task_mmu.c | 1 + include/linux/mem_reliable.h | 17 +++++++++++++++++ kernel/events/uprobes.c | 2 ++ mm/huge_memory.c | 8 ++++++++ mm/khugepaged.c | 1 + mm/ksm.c | 1 + mm/mem_reliable.c | 15 ++++++++++++++- mm/memory.c | 16 ++++++++++++++++ mm/migrate.c | 1 + mm/rmap.c | 5 +++++ mm/shmem.c | 1 + mm/swapfile.c | 1 + mm/userfaultfd.c | 1 + 14 files changed, 71 insertions(+), 1 deletion(-)
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 690db5b3eb53b..1ef781f33b376 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -181,6 +181,7 @@ read the file /proc/PID/status: VmPTE: 20 kb VmSwap: 0 kB HugetlbPages: 0 kB + Reliable: 1608 KB CoreDumping: 0 Threads: 1 SigQ: 0/28578 @@ -254,6 +255,7 @@ Table 1-2: Contents of the status files (as of 4.8) VmSwap amount of swap used by anonymous private data (shmem swap usage is not included) HugetlbPages size of hugetlb memory portions + Reliable size of reliable memory used CoreDumping process's memory is currently being dumped (killing the process may lead to a corrupted core) Threads number of threads diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 495044e1990bd..78ce353d0dfad 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -77,6 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); + reliable_report_usage(m, mm); } #undef SEQ_PUT_DEC
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 31be68fac330b..a18a843c7b52f 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -14,11 +14,14 @@ extern struct static_key_false mem_reliable;
extern bool reliable_enabled; +extern atomic_long_t reliable_user_used_nr_page;
extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn); extern void reliable_report_meminfo(struct seq_file *m); +extern bool page_reliable(struct page *page); +extern void reliable_report_usage(struct seq_file *m, struct mm_struct *mm);
static inline bool mem_reliable_is_enabled(void) { @@ -47,6 +50,15 @@ static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z)
return false; } + +static inline void reliable_page_counter(struct page *page, + struct mm_struct *mm, int val) +{ + if (page_reliable(page)) { + atomic_long_add(val, &mm->reliable_nr_page); + atomic_long_add(val, &reliable_user_used_nr_page); + } +} #else #define reliable_enabled 0
@@ -60,6 +72,11 @@ static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) return false; } static inline void reliable_report_meminfo(struct seq_file *m) {} +static inline bool page_reliable(struct page *page) { return false; } +static inline void reliable_page_counter(struct page *page, + struct mm_struct *mm, int val) {} +static inline void reliable_report_usage(struct seq_file *m, + struct mm_struct *mm) {}
#endif
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c173e4131df88..de64e29830824 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -191,7 +191,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (!PageAnon(old_page)) { dec_mm_counter(mm, mm_counter_file(old_page)); + reliable_page_counter(old_page, mm, -1); inc_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(new_page, mm, 1); }
flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 31f1c580ba9c0..f8319265c1cf3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -673,6 +673,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); @@ -1080,6 +1081,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(src_page, dst_mm, HPAGE_PMD_NR); mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
@@ -1468,6 +1470,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(new_page, vma->vm_mm, + HPAGE_PMD_NR); } else { VM_BUG_ON_PAGE(!PageHead(page), page); page_remove_rmap(page, true); @@ -1850,10 +1854,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (PageAnon(page)) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + reliable_page_counter(page, tlb->mm, -HPAGE_PMD_NR); } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); + reliable_page_counter(page, tlb->mm, -HPAGE_PMD_NR); }
spin_unlock(ptl); @@ -2209,6 +2215,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, put_page(page); } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); + reliable_page_counter(page, mm, -HPAGE_PMD_NR); return; }
@@ -3170,6 +3177,7 @@ vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long add pgtable_trans_huge_deposit(vma->vm_mm, pmd, pgtable); set_pmd_at(vma->vm_mm, address, pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(ptl); count_vm_event(THP_FAULT_ALLOC); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 55f171ed2d08a..5ac2486327528 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -648,6 +648,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + reliable_page_counter(page, vma->vm_mm, 1); if (is_zero_pfn(pte_pfn(pteval))) { /* * ptl mostly unnecessary. diff --git a/mm/ksm.c b/mm/ksm.c index 9749729a5381a..b656fa77f92ff 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1184,6 +1184,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * when tearing down the mm. */ dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); }
flush_cache_page(vma, addr, pte_pfn(*ptep)); diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index c03c77090cf5b..d6aec08638923 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -12,14 +12,19 @@ DEFINE_STATIC_KEY_FALSE(mem_reliable);
bool reliable_enabled; - static atomic_long_t total_reliable_mem; +atomic_long_t reliable_user_used_nr_page;
void add_reliable_mem_size(long sz) { atomic_long_add(sz, &total_reliable_mem); }
+bool page_reliable(struct page *page) +{ + return mem_reliable_is_enabled() && page_zonenum(page) < ZONE_MOVABLE; +} + static int reliable_mem_notifier(struct notifier_block *nb, unsigned long action, void *arg) { @@ -105,3 +110,11 @@ void reliable_report_meminfo(struct seq_file *m) used_reliable_mem_sz() >> 10); } } + +void reliable_report_usage(struct seq_file *m, struct mm_struct *mm) +{ + if (mem_reliable_is_enabled()) { + seq_printf(m, "Reliable:\t%8lu kB\n", + atomic_long_read(&mm->reliable_nr_page)); + } +} diff --git a/mm/memory.c b/mm/memory.c index 054e62292902a..d4853970a7c10 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -740,6 +740,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
rss[mm_counter(page)]++;
+ reliable_page_counter(page, dst_mm, 1); if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* @@ -766,6 +767,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ get_page(page); rss[mm_counter(page)]++; + reliable_page_counter(page, dst_mm, 1); page_dup_rmap(page, false);
/* @@ -807,6 +809,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; + reliable_page_counter(page, dst_mm, 1); } else if (pte_devmap(pte)) { page = pte_page(pte);
@@ -819,6 +822,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; + reliable_page_counter(page, dst_mm, 1); } }
@@ -1102,6 +1106,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, mark_page_accessed(page); } rss[mm_counter(page)]--; + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); @@ -1130,6 +1135,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); put_page(page); continue; @@ -1147,6 +1153,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
page = migration_entry_to_page(entry); rss[mm_counter(page)]--; + reliable_page_counter(page, mm, -1); } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); @@ -1490,6 +1497,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); inc_mm_counter_fast(mm, mm_counter_file(page)); + reliable_page_counter(page, mm, 1); page_add_file_rmap(page, false); set_pte_at(mm, addr, pte, mk_pte(page, prot));
@@ -2489,10 +2497,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, mm_counter_file(old_page)); + reliable_page_counter(old_page, mm, -1); inc_mm_counter_fast(mm, MM_ANONPAGES); + reliable_page_counter(new_page, mm, 1); } } else { inc_mm_counter_fast(mm, MM_ANONPAGES); + reliable_page_counter(new_page, mm, 1); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); @@ -3051,6 +3062,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(page, vma->vm_mm, 1); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { @@ -3216,6 +3228,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) }
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(page, vma->vm_mm, 1); page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); @@ -3416,6 +3429,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); + reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); page_add_file_rmap(page, true); /* * deposit and withdraw with pmd lock held @@ -3489,6 +3503,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ + reliable_page_counter(page, vma->vm_mm, 1); if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); @@ -4910,6 +4925,7 @@ vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address, if (ret) goto release; inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(page, vma->vm_mm, 1); page_add_new_anon_rmap(page, vma, address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); diff --git a/mm/migrate.c b/mm/migrate.c index 90aa493faa602..eb27e8e2bf213 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2714,6 +2714,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, }
inc_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, 1); page_add_new_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, false, false); if (!is_zone_device_page(page)) diff --git a/mm/rmap.c b/mm/rmap.c index 7debdf0cc6785..224fac084ad0e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1548,6 +1548,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, vma_mmu_pagesize(vma)); } else { dec_mm_counter(mm, mm_counter(page)); + reliable_page_counter(page, mm, -1); set_pte_at(mm, address, pvmw.pte, pteval); }
@@ -1563,6 +1564,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * copied pages. */ dec_mm_counter(mm, mm_counter(page)); + reliable_page_counter(page, mm, -1); /* We have to invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); @@ -1617,6 +1619,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); goto discard; }
@@ -1650,6 +1653,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) @@ -1670,6 +1674,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(page)); + reliable_page_counter(page, mm, -1); } discard: /* diff --git a/mm/shmem.c b/mm/shmem.c index 8d32d49a4d7ba..16bb7806a25e6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2473,6 +2473,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, spin_unlock_irq(&info->lock);
inc_mm_counter(dst_mm, mm_counter_file(page)); + reliable_page_counter(page, dst_mm, 1); page_add_file_rmap(page, false); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
diff --git a/mm/swapfile.c b/mm/swapfile.c index 4028994a51ae6..2619729400d32 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1869,6 +1869,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(page, vma->vm_mm, 1); get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 1c86abd41c6d7..c26dd2040624f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -116,6 +116,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, #endif
inc_mm_counter(dst_mm, MM_ANONPAGES); + reliable_page_counter(page, dst_mm, 1); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, dst_vma);
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
When khugepaged collapses pages into a huge page, the huge page should come from the same kind of memory region as the pages being collapsed. khugepaged therefore checks whether the area to be collapsed contains any reliable pages: if it does, the huge page is allocated from the mirrored region, otherwise it is allocated from the non-mirrored region.
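Condensed, the change below amounts to the following (an illustrative paraphrase of the diff, not new code):

    /* scan side: remember whether any PTE in the range maps a reliable page */
    if (page_reliable(page))
            reliable = true;

    /* allocation side: collapse into a huge page from the same kind of region */
    gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
    if (reliable)
            gfp |= ___GFP_RELIABILITY;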
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/khugepaged.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5ac2486327528..7f37633a886e0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -950,7 +950,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - int node, int referenced, int unmapped) + int node, int referenced, int unmapped, + bool reliable) { pmd_t *pmd, _pmd; pte_t *pte; @@ -969,6 +970,9 @@ static void collapse_huge_page(struct mm_struct *mm, /* Only allocate from the target node */ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
+ if (reliable) + gfp |= ___GFP_RELIABILITY; + /* * Before allocating the hugepage, release the mmap_sem read lock. * The allocation can take potentially a long time if it involves @@ -1127,6 +1131,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; bool writable = false; + bool reliable = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -1215,6 +1220,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced++; + + if (page_reliable(page)) + reliable = true; } if (!writable) { result = SCAN_PAGE_RO; @@ -1230,7 +1238,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ collapse_huge_page(mm, address, hpage, node, - referenced, unmapped); + referenced, unmapped, reliable); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1324,7 +1332,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) */ static void collapse_shmem(struct mm_struct *mm, struct address_space *mapping, pgoff_t start, - struct page **hpage, int node) + struct page **hpage, int node, bool reliable) { gfp_t gfp; struct page *page, *new_page, *tmp; @@ -1340,6 +1348,9 @@ static void collapse_shmem(struct mm_struct *mm, /* Only allocate from the target node */ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
+ if (reliable) + gfp |= ___GFP_RELIABILITY; + new_page = khugepaged_alloc_page(hpage, gfp, node); if (!new_page) { result = SCAN_ALLOC_HUGE_PAGE_FAIL; @@ -1613,6 +1624,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; + bool reliable = false;
present = 0; swap = 0; @@ -1670,6 +1682,9 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); } + + if (page_reliable(page)) + reliable = true; } rcu_read_unlock();
@@ -1678,7 +1693,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, result = SCAN_EXCEED_NONE_PTE; } else { node = khugepaged_find_target_node(); - collapse_shmem(mm, mapping, start, hpage, node); + collapse_shmem(mm, mapping, start, hpage, node, + reliable); } }
From: Peng Wu wupeng58@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
----------------------------------------------
There is an upper limit on memory allocation for special user tasks; a special user task is a user task with the reliable flag set.
Init tasks will allocate memory from the non-mirrored region if their allocation would exceed this limit.
The limit can be set or read via /proc/sys/vm/task_reliable_limit.
This limit's default value is ULONG_MAX.
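For example, assuming 4 KiB pages, writing 8589934592 (8 GiB) to /proc/sys/vm/task_reliable_limit caps reliable user allocations at 2097152 pages; the sysctl handler added below rejects values larger than the total mirrored memory.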
Signed-off-by: Peng Wu wupeng58@huawei.com Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/mem_reliable.h | 20 +++++++++ lib/show_mem.c | 1 + mm/khugepaged.c | 14 ++++++ mm/mem_reliable.c | 86 ++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 50 ++++++++++++++++++--- 5 files changed, 164 insertions(+), 7 deletions(-)
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index a18a843c7b52f..e4097f0cff679 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -15,6 +15,7 @@ extern struct static_key_false mem_reliable;
extern bool reliable_enabled; extern atomic_long_t reliable_user_used_nr_page; +extern unsigned long task_reliable_limit __read_mostly;
extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, @@ -22,6 +23,9 @@ extern void mem_reliable_init(bool has_unmirrored_mem, extern void reliable_report_meminfo(struct seq_file *m); extern bool page_reliable(struct page *page); extern void reliable_report_usage(struct seq_file *m, struct mm_struct *mm); +extern void reliable_show_mem_info(void); +extern void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, + int preferred_nid, nodemask_t *nodemask);
static inline bool mem_reliable_is_enabled(void) { @@ -59,6 +63,12 @@ static inline void reliable_page_counter(struct page *page, atomic_long_add(val, &reliable_user_used_nr_page); } } + +static inline bool reliable_mem_limit_check(unsigned long nr_page) +{ + return atomic_long_read(&reliable_user_used_nr_page) + nr_page <= + task_reliable_limit / PAGE_SIZE; +} #else #define reliable_enabled 0
@@ -78,6 +88,16 @@ static inline void reliable_page_counter(struct page *page, static inline void reliable_report_usage(struct seq_file *m, struct mm_struct *mm) {}
+static inline bool reliable_mem_limit_check(unsigned long nr_page) +{ + return false; +} +static inline void reliable_show_mem_info(void) {} +static inline void mem_reliable_out_of_memory(gfp_t gfp_mask, + unsigned int order, + int preferred_nid, + nodemask_t *nodemask) {} + #endif
#endif diff --git a/lib/show_mem.c b/lib/show_mem.c index 0beaa1d899aae..0f85331ba91b9 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -49,4 +49,5 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif + reliable_show_mem_info(); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7f37633a886e0..2b154ff6ee734 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1235,6 +1235,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { + if (reliable && + !reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) { + ret = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out; + } + node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ collapse_huge_page(mm, address, hpage, node, @@ -1692,6 +1698,12 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; } else { + if (reliable && + !reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out; + } + node = khugepaged_find_target_node(); collapse_shmem(mm, mapping, start, hpage, node, reliable); @@ -1699,6 +1711,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, }
/* TODO: tracepoints */ +out: + return; } #else static void khugepaged_scan_shmem(struct mm_struct *mm, diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index d6aec08638923..c24c5b7cbca33 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -8,12 +8,15 @@ #include <linux/memory_hotplug.h> #include <linux/seq_file.h> #include <linux/mmzone.h> +#include <linux/oom.h>
DEFINE_STATIC_KEY_FALSE(mem_reliable);
bool reliable_enabled; static atomic_long_t total_reliable_mem; atomic_long_t reliable_user_used_nr_page; +/* reliable user limit for user tasks with reliable flag */ +unsigned long task_reliable_limit = ULONG_MAX;
void add_reliable_mem_size(long sz) { @@ -118,3 +121,86 @@ void reliable_report_usage(struct seq_file *m, struct mm_struct *mm) atomic_long_read(&mm->reliable_nr_page)); } } + +#ifdef CONFIG_SYSCTL +int reliable_limit_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + unsigned long old = task_reliable_limit; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret == 0 && write) { + if (task_reliable_limit > total_reliable_mem_sz()) { + task_reliable_limit = old; + return -EINVAL; + } + } + + return ret; +} + +static struct ctl_table reliable_ctl_table[] = { + { + .procname = "task_reliable_limit", + .data = &task_reliable_limit, + .maxlen = sizeof(task_reliable_limit), + .mode = 0644, + .proc_handler = reliable_limit_handler, + }, + {} +}; + +static struct ctl_table reliable_dir_table[] = { + { + .procname = "vm", + .maxlen = 0, + .mode = 0555, + .child = reliable_ctl_table, + }, + {} +}; + +static int __init reliable_sysctl_init(void) +{ + if (!mem_reliable_is_enabled()) + return 0; + + if (!register_sysctl_table(reliable_dir_table)) { + pr_err("register sysctl failed."); + return -1; + } + + return 0; +} +late_initcall(reliable_sysctl_init); +#endif + +void reliable_show_mem_info(void) +{ + if (mem_reliable_is_enabled()) { + pr_info("ReliableTotal: %lu kB", total_reliable_mem_sz() >> 10); + pr_info("ReliableUsed: %lu kB", used_reliable_mem_sz() >> 10); + pr_info("task_reliable_limit: %lu kB", + task_reliable_limit >> 10); + pr_info("reliable_user_used: %ld kB", + atomic_long_read(&reliable_user_used_nr_page) * 4); + } +} + +void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, + int preferred_nid, nodemask_t *nodemask) +{ + struct oom_control oc = { + .zonelist = node_zonelist(preferred_nid, gfp_mask), + .nodemask = nodemask, + .memcg = NULL, + .gfp_mask = gfp_mask, + .order = order, + }; + + if (!mutex_trylock(&oom_lock)) + return; + out_of_memory(&oc); + mutex_unlock(&oom_lock); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 95d2450cf1771..bfc0c2d1825cd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4561,19 +4561,51 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) ac->high_zoneidx, ac->nodemask); }
-static inline void prepare_before_alloc(gfp_t *gfp_mask) +/* + * return false means this allocation is limit by reliable user limit and + * this will lead to pagefault_out_of_memory() + */ +static inline bool prepare_before_alloc(gfp_t *gfp_mask, unsigned int order) { gfp_t gfp_ori = *gfp_mask; *gfp_mask &= gfp_allowed_mask;
if (!mem_reliable_is_enabled()) - return; + return true;
- if (gfp_ori & ___GFP_RELIABILITY) + if (gfp_ori & ___GFP_RELIABILITY) { *gfp_mask |= ___GFP_RELIABILITY; + return true; + }
- if (current->flags & PF_RELIABLE || is_global_init(current)) - *gfp_mask |= ___GFP_RELIABILITY; + /* + * Init tasks will alloc memory from non-mirrored region if their + * allocation trigger task_reliable_limit + */ + if (is_global_init(current)) { + if (reliable_mem_limit_check(1 << order)) + *gfp_mask |= ___GFP_RELIABILITY; + return true; + } + + /* + * This only check task_reliable_limit without ___GFP_RELIABILITY + * or this process is global init. + * For kernel internal mechanism(hugepaged collapse and others) + * If they alloc memory for user and obey task_reliable_limit, they + * need to check this limit before allocing pages. + */ + if ((current->flags & PF_RELIABLE) && (gfp_ori & __GFP_HIGHMEM) && + (gfp_ori & __GFP_MOVABLE)) { + if (reliable_mem_limit_check(1 << order)) { + *gfp_mask |= ___GFP_RELIABILITY; + return true; + } + + return false; + } + + return true; }
/* @@ -4583,7 +4615,7 @@ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask) { - struct page *page; + struct page *page = NULL; unsigned int alloc_flags = ALLOC_WMARK_LOW; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; @@ -4597,7 +4629,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, return NULL; }
- prepare_before_alloc(&gfp_mask); + if (!prepare_before_alloc(&gfp_mask, order)) { + mem_reliable_out_of_memory(gfp_mask, order, preferred_nid, + nodemask); + goto out; + }
alloc_mask = gfp_mask; if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Introduce a fallback mechanism for memory reliable. The following consumers will fall back to the non-mirrored region if their allocation from the mirrored region fails:
- User tasks with the reliable flag
- THP collapse pages
- Init tasks
- Pagecache
- Tmpfs
To achieve these goals, the buddy system will fall back to the non-mirrored region in the following situations:
- if __GFP_THISNODE is set in gfp_mask and the destination nodes do not have any suitable zones available
- high_zoneidx is raised to ZONE_MOVABLE to retry the allocation before triggering OOM
This mechanism is enabled by default and can be disabled by adding "reliable_debug=F" to the kernel parameters. It relies on CONFIG_MEMORY_RELIABLE and requires "kernelcore=reliable" in the kernel parameters.
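For example, booting with "kernelcore=reliable reliable_debug=F" keeps memory reliable enabled but turns the fallback off again, so allocations that cannot be satisfied from the mirrored region fail instead of spilling into the non-mirrored region.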
Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- .../admin-guide/kernel-parameters.txt | 9 +++ include/linux/mem_reliable.h | 7 ++ mm/khugepaged.c | 30 +++++--- mm/mem_reliable.c | 29 ++++++++ mm/page_alloc.c | 70 ++++++++++++++++++- 5 files changed, 133 insertions(+), 12 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cc5eec8959a07..3fc729aab31a6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1972,6 +1972,15 @@ Option "reliable" is base on option "mirror", but make some extension. These two features are alternatives.
+ reliable_debug= [ARM64] + Format: [F] + Only works with CONFIG_MEMORY_RELIABLE and + "kernelcore=reliable" is configured. + F: User tasks with PF_RELIABLE will not allocate + memory from non-mirrored region if this allocation + from mirrored region failed. + Pagecache and tmpfs will follow this rule too. + kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. Format: <Controller#>[,poll interval] The controller # is the number of the ehci usb debug diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index e4097f0cff679..c9c4d94a4df46 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -16,6 +16,7 @@ extern struct static_key_false mem_reliable; extern bool reliable_enabled; extern atomic_long_t reliable_user_used_nr_page; extern unsigned long task_reliable_limit __read_mostly; +extern bool reliable_allow_fallback;
extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, @@ -69,8 +70,14 @@ static inline bool reliable_mem_limit_check(unsigned long nr_page) return atomic_long_read(&reliable_user_used_nr_page) + nr_page <= task_reliable_limit / PAGE_SIZE; } + +static inline bool reliable_allow_fb_enabled(void) +{ + return reliable_allow_fallback; +} #else #define reliable_enabled 0 +#define reliable_allow_fb_enabled() false
static inline bool mem_reliable_is_enabled(void) { return false; } static inline void add_reliable_mem_size(long sz) {} diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2b154ff6ee734..c9be18c669a17 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1235,10 +1235,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { - if (reliable && - !reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) { - ret = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out; + if (reliable) { + if (!reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) { + if (reliable_allow_fb_enabled()) { + reliable = false; + } else { + ret = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out; + } + } }
node = khugepaged_find_target_node(); @@ -1695,15 +1700,20 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, rcu_read_unlock();
if (result == SCAN_SUCCEED) { + if (reliable) { + if (!reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) { + if (reliable_allow_fb_enabled()) { + reliable = false; + } else { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out; + } + } + } + if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; } else { - if (reliable && - !reliable_mem_limit_check(1 << HPAGE_PMD_ORDER)) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out; - } - node = khugepaged_find_target_node(); collapse_shmem(mm, mapping, start, hpage, node, reliable); diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index c24c5b7cbca33..60a214e3b28f7 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -17,6 +17,7 @@ static atomic_long_t total_reliable_mem; atomic_long_t reliable_user_used_nr_page; /* reliable user limit for user tasks with reliable flag */ unsigned long task_reliable_limit = ULONG_MAX; +bool reliable_allow_fallback __read_mostly = true;
void add_reliable_mem_size(long sz) { @@ -204,3 +205,31 @@ void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, out_of_memory(&oc); mutex_unlock(&oom_lock); } + +static int __init setup_reliable_debug(char *str) +{ + if (*str++ != '=' || !*str) + /* + * No options specified. + */ + goto out; + + /* + * Determine which debug features should be switched on + */ + for (; *str && *str != ','; str++) { + switch (*str) { + case 'F': + reliable_allow_fallback = false; + pr_info("fallback disabled."); + break; + default: + pr_err("reliable_debug option '%c' unknown. skipped\n", + *str); + } + } + +out: + return 1; +} +__setup("reliable_debug", setup_reliable_debug); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bfc0c2d1825cd..455525e49c727 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3656,6 +3656,60 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, return page; }
+#ifdef CONFIG_MEMORY_RELIABLE +static inline struct zone *reliable_fb_find_zone(gfp_t gfp_mask, + struct alloc_context *ac) +{ + if (!reliable_allow_fb_enabled()) + return NULL; + + /* dst nodemask may don't have zone we want, fallback here */ + if ((gfp_mask & __GFP_THISNODE) && (ac->high_zoneidx == ZONE_NORMAL) && + (gfp_mask & ___GFP_RELIABILITY)) { + struct zoneref *ref = first_zones_zonelist( + ac->zonelist, ZONE_MOVABLE, ac->nodemask); + return ref->zone; + } + + return NULL; +} + +static inline struct page * +reliable_fb_before_oom(gfp_t gfp_mask, int order, + const struct alloc_context *ac) +{ + if (!reliable_allow_fb_enabled()) + return NULL; + + /* key user process alloc mem from movable zone to avoid oom */ + if ((ac->high_zoneidx == ZONE_NORMAL) && + (gfp_mask & ___GFP_RELIABILITY)) { + struct alloc_context tmp_ac = *ac; + + tmp_ac.high_zoneidx = ZONE_MOVABLE; + tmp_ac.preferred_zoneref = first_zones_zonelist( + ac->zonelist, ZONE_MOVABLE, ac->nodemask); + return get_page_from_freelist( + (gfp_mask | __GFP_HARDWALL) & ~__GFP_DIRECT_RECLAIM, + order, ALLOC_WMARK_HIGH | ALLOC_CPUSET, &tmp_ac); + } + + return NULL; +} +#else +static inline struct zone *reliable_fb_find_zone(gfp_t gfp_mask, + struct alloc_context *ac) +{ + return NULL; +} + +static inline struct page *reliable_fb_before_oom(gfp_t gfp_mask, int order, + const struct alloc_context *ac) +{ + return NULL; +} +#endif + static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) @@ -3694,6 +3748,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out;
+ page = reliable_fb_before_oom(gfp_mask, order, ac); + if (page) + goto out; + /* Coredumps can quickly deplete all memory reserves */ if (current->flags & PF_DUMPCORE) goto out; @@ -4301,8 +4359,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); - if (!ac->preferred_zoneref->zone) - goto nopage; + if (!ac->preferred_zoneref->zone) { + ac->preferred_zoneref->zone = + reliable_fb_find_zone(gfp_mask, ac); + + if (!ac->preferred_zoneref->zone) + goto nopage; + }
if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, gfp_mask, ac); @@ -4602,6 +4665,9 @@ static inline bool prepare_before_alloc(gfp_t *gfp_mask, unsigned int order) return true; }
+ if (reliable_allow_fb_enabled()) + return true; + return false; }
From: Zhou Guanghui zhouguanghui1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
------------------------------------------
This feature depends on the overall memory reliable feature. When shared memory reliable is enabled, the pages used by shared memory are allocated from the mirrored region by default. If the mirrored region is insufficient, the allocation falls back to the non-mirrored region.
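In practice shmem allocations are tagged with ___GFP_RELIABILITY by default, and because the fallback mechanism from the previous patch stays available they can still be satisfied from the non-mirrored region when mirrored memory runs short; passing "reliable_debug=S" (added to the option list below) opts shmem out of reliable memory entirely.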
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/admin-guide/kernel-parameters.txt | 3 ++- include/linux/mem_reliable.h | 10 ++++++++++ mm/mem_reliable.c | 16 ++++++++++++++++ mm/shmem.c | 12 ++++++++++++ 4 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3fc729aab31a6..8b6f7071072b7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1973,13 +1973,14 @@ some extension. These two features are alternatives.
reliable_debug= [ARM64] - Format: [F] + Format: [F][,S] Only works with CONFIG_MEMORY_RELIABLE and "kernelcore=reliable" is configured. F: User tasks with PF_RELIABLE will not allocate memory from non-mirrored region if this allocation from mirrored region failed. Pagecache and tmpfs will follow this rule too. + S: The shmem does not use the reliable memory.
kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. Format: <Controller#>[,poll interval] diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index c9c4d94a4df46..0641c7a88c786 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -17,10 +17,12 @@ extern bool reliable_enabled; extern atomic_long_t reliable_user_used_nr_page; extern unsigned long task_reliable_limit __read_mostly; extern bool reliable_allow_fallback; +extern bool shmem_reliable;
extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn); +extern void shmem_reliable_init(void); extern void reliable_report_meminfo(struct seq_file *m); extern bool page_reliable(struct page *page); extern void reliable_report_usage(struct seq_file *m, struct mm_struct *mm); @@ -75,6 +77,12 @@ static inline bool reliable_allow_fb_enabled(void) { return reliable_allow_fallback; } + +static inline bool shmem_reliable_is_enabled(void) +{ + return shmem_reliable; +} + #else #define reliable_enabled 0 #define reliable_allow_fb_enabled() false @@ -83,6 +91,7 @@ static inline bool mem_reliable_is_enabled(void) { return false; } static inline void add_reliable_mem_size(long sz) {} static inline void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) {} +static inline void shmem_reliable_init(void) {} static inline bool zone_reliable(struct zone *zone) { return false; } static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) { @@ -104,6 +113,7 @@ static inline void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask) {} +static inline bool shmem_reliable_is_enabled(void) { return false; }
#endif
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 60a214e3b28f7..32a0270b494d2 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -18,6 +18,7 @@ atomic_long_t reliable_user_used_nr_page; /* reliable user limit for user tasks with reliable flag */ unsigned long task_reliable_limit = ULONG_MAX; bool reliable_allow_fallback __read_mostly = true; +bool shmem_reliable __read_mostly = true;
void add_reliable_mem_size(long sz) { @@ -88,6 +89,17 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) atomic_long_read(&total_reliable_mem)); }
+void shmem_reliable_init(void) +{ + if (!shmem_reliable_is_enabled()) + return; + + if (!mem_reliable_is_enabled()) { + shmem_reliable = false; + pr_info("shmem reliable disabled.\n"); + } +} + static unsigned long total_reliable_mem_sz(void) { return atomic_long_read(&total_reliable_mem); @@ -223,6 +235,10 @@ static int __init setup_reliable_debug(char *str) reliable_allow_fallback = false; pr_info("fallback disabled."); break; + case 'S': + shmem_reliable = false; + pr_info("shmem reliable disabled."); + break; default: pr_err("reliable_debug option '%c' unknown. skipped\n", *str); diff --git a/mm/shmem.c b/mm/shmem.c index 16bb7806a25e6..e27fc90bab412 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1593,6 +1593,14 @@ static struct page *shmem_alloc_page(gfp_t gfp, return page; }
+static inline void shmem_prepare_alloc(gfp_t *gfp_mask) +{ + if (!shmem_reliable_is_enabled()) + return; + + *gfp_mask |= ___GFP_RELIABILITY; +} + static struct page *shmem_alloc_and_acct_page(gfp_t gfp, struct inode *inode, pgoff_t index, bool huge, int node_id) @@ -1609,6 +1617,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, if (!shmem_inode_acct_block(inode, nr)) goto failed;
+ shmem_prepare_alloc(&gfp); + if (huge) page = shmem_alloc_hugepage(gfp, info, index, node_id); else @@ -3941,6 +3951,8 @@ int __init shmem_init(void) else shmem_huge = 0; /* just in case it was patched */ #endif + + shmem_reliable_init(); return 0;
out1:
From: Zhou Guanghui zhouguanghui1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
------------------------------------------
Add ReliableShmem to /proc/meminfo to show the amount of reliable memory used by shmem.
- ReliableShmem: reliable memory used by shmem
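Once this patch is applied, the counter can be read alongside the other reliable counters, for example (values are the samples from the proc.txt hunk below):

  $ grep Reliable /proc/meminfo
  ReliableTotal:   7340032 kB
  ReliableUsed:     418824 kB
  ReliableShmem:        96 kB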
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/filesystems/proc.txt | 2 ++ include/linux/mem_reliable.h | 11 +++++++++++ mm/mem_reliable.c | 10 ++++++++++ mm/shmem.c | 5 +++++ 4 files changed, 28 insertions(+)
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 1ef781f33b376..78c76d24f9f7d 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -884,6 +884,7 @@ ShmemHugePages: 0 kB ShmemPmdMapped: 0 kB ReliableTotal: 7340032 kB ReliableUsed: 418824 kB +ReliableShmem: 96 kB
MemTotal: Total usable ram (i.e. physical ram minus a few reserved @@ -976,6 +977,7 @@ VmallocChunk: largest contiguous block of vmalloc area which is free allocations. This stat excludes the cost of metadata. ReliableTotal: Total reliable memory size ReliableUsed: The used amount of reliable memory +ReliableShmem: Reliable memory used by shmem
..............................................................................
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 0641c7a88c786..4b51dfc513fc4 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -7,6 +7,7 @@ #include <linux/mmzone.h> #include <linux/mm_types.h> #include <linux/sched.h> +#include <linux/percpu_counter.h>
#ifdef CONFIG_MEMORY_RELIABLE @@ -18,6 +19,7 @@ extern atomic_long_t reliable_user_used_nr_page; extern unsigned long task_reliable_limit __read_mostly; extern bool reliable_allow_fallback; extern bool shmem_reliable; +extern struct percpu_counter reliable_shmem_used_nr_page;
extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, @@ -83,6 +85,12 @@ static inline bool shmem_reliable_is_enabled(void) return shmem_reliable; }
+static inline void shmem_reliable_page_counter(struct page *page, int nr_page) +{ + if (shmem_reliable_is_enabled() && page_reliable(page)) + percpu_counter_add(&reliable_shmem_used_nr_page, nr_page); +} + #else #define reliable_enabled 0 #define reliable_allow_fb_enabled() false @@ -114,6 +122,9 @@ static inline void mem_reliable_out_of_memory(gfp_t gfp_mask, int preferred_nid, nodemask_t *nodemask) {} static inline bool shmem_reliable_is_enabled(void) { return false; } +static inline void shmem_reliable_page_counter(struct page *page, int nr_page) +{ +}
#endif
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 32a0270b494d2..89164bc5728b9 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -19,6 +19,7 @@ atomic_long_t reliable_user_used_nr_page; unsigned long task_reliable_limit = ULONG_MAX; bool reliable_allow_fallback __read_mostly = true; bool shmem_reliable __read_mostly = true; +struct percpu_counter reliable_shmem_used_nr_page __read_mostly;
void add_reliable_mem_size(long sz) { @@ -97,7 +98,10 @@ void shmem_reliable_init(void) if (!mem_reliable_is_enabled()) { shmem_reliable = false; pr_info("shmem reliable disabled.\n"); + return; } + + percpu_counter_init(&reliable_shmem_used_nr_page, 0, GFP_KERNEL); }
static unsigned long total_reliable_mem_sz(void) @@ -124,6 +128,12 @@ void reliable_report_meminfo(struct seq_file *m) total_reliable_mem_sz() >> 10); seq_printf(m, "ReliableUsed: %8lu kB\n", used_reliable_mem_sz() >> 10); + + if (shmem_reliable_is_enabled()) { + unsigned long shmem = (unsigned long)percpu_counter_sum( + &reliable_shmem_used_nr_page) << (PAGE_SHIFT - 10); + seq_printf(m, "ReliableShmem: %8lu kB\n", shmem); + } } }
diff --git a/mm/shmem.c b/mm/shmem.c index e27fc90bab412..aabf0dc626da5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -957,6 +957,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, truncate_inode_page(mapping, page); } } + shmem_reliable_page_counter( + page, -(1 << compound_order(page))); unlock_page(page); } pagevec_remove_exceptionals(&pvec); @@ -1067,6 +1069,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, break; } } + shmem_reliable_page_counter( + page, -(1 << compound_order(page))); unlock_page(page); } pagevec_remove_exceptionals(&pvec); @@ -1962,6 +1966,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); + shmem_reliable_page_counter(page, 1 << compound_order(page)); alloced = true;
if (PageTransHuge(page) &&
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
__page_cache_alloc() is used to allocate page cache pages by most filesystems, such as ext4 and f2fs, so add the ___GFP_RELIABILITY flag there to support CONFIG_MEMORY_RELIABLE when allocating pages.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/pagemap.h | 1 + mm/filemap.c | 1 + 2 files changed, 2 insertions(+)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 085aed892ce58..a6457acd7462e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -249,6 +249,7 @@ extern struct page *__page_cache_alloc(gfp_t gfp); #else static inline struct page *__page_cache_alloc(gfp_t gfp) { + gfp |= ___GFP_RELIABILITY; return alloc_pages(gfp, 0); } #endif diff --git a/mm/filemap.c b/mm/filemap.c index f578d4e3e2c86..c30e5c1eb77c2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1042,6 +1042,7 @@ struct page *__page_cache_alloc(gfp_t gfp) int n; struct page *page;
+ gfp |= ___GFP_RELIABILITY; if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do {
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Add a cmdline option to control reliable memory usage by the page cache. The page cache will not use reliable memory when option "P" is passed to reliable_debug on the kernel command line.
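For example (illustrative), booting with

  reliable_debug=F,S,P

combines all three debug options: tasks with PF_RELIABLE do not fall back to the non-mirrored region, and neither shmem nor the page cache uses reliable memory.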
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/admin-guide/kernel-parameters.txt | 3 ++- include/linux/mem_reliable.h | 8 ++++++++ include/linux/pagemap.h | 4 +++- mm/filemap.c | 6 +++++- mm/mem_reliable.c | 5 +++++ 5 files changed, 23 insertions(+), 3 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8b6f7071072b7..a46b2fe191ba7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1973,7 +1973,7 @@ some extension. These two features are alternatives.
reliable_debug= [ARM64] - Format: [F][,S] + Format: [F][,S][,P] Only works with CONFIG_MEMORY_RELIABLE and "kernelcore=reliable" is configured. F: User tasks with PF_RELIABLE will not allocate @@ -1981,6 +1981,7 @@ from mirrored region failed. Pagecache and tmpfs will follow this rule too. S: The shmem does not use the reliable memory. + P: Page cache does not use the reliable memory.
kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. Format: <Controller#>[,poll interval] diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 4b51dfc513fc4..0c5f80428e973 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -20,6 +20,7 @@ extern unsigned long task_reliable_limit __read_mostly; extern bool reliable_allow_fallback; extern bool shmem_reliable; extern struct percpu_counter reliable_shmem_used_nr_page; +extern bool pagecache_use_reliable_mem;
extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, @@ -85,6 +86,11 @@ static inline bool shmem_reliable_is_enabled(void) return shmem_reliable; }
+static inline bool pagecache_reliable_is_enabled(void) +{ + return pagecache_use_reliable_mem; +} + static inline void shmem_reliable_page_counter(struct page *page, int nr_page) { if (shmem_reliable_is_enabled() && page_reliable(page)) @@ -94,6 +100,7 @@ static inline void shmem_reliable_page_counter(struct page *page, int nr_page) #else #define reliable_enabled 0 #define reliable_allow_fb_enabled() false +#define pagecache_use_reliable_mem 0
static inline bool mem_reliable_is_enabled(void) { return false; } static inline void add_reliable_mem_size(long sz) {} @@ -126,6 +133,7 @@ static inline void shmem_reliable_page_counter(struct page *page, int nr_page) { }
+static inline bool pagecache_reliable_is_enabled(void) { return false; } #endif
#endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a6457acd7462e..77563c03618c9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -249,7 +249,9 @@ extern struct page *__page_cache_alloc(gfp_t gfp); #else static inline struct page *__page_cache_alloc(gfp_t gfp) { - gfp |= ___GFP_RELIABILITY; + if (pagecache_reliable_is_enabled()) + gfp |= ___GFP_RELIABILITY; + return alloc_pages(gfp, 0); } #endif diff --git a/mm/filemap.c b/mm/filemap.c index c30e5c1eb77c2..4dc3cc5834a55 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1042,7 +1042,11 @@ struct page *__page_cache_alloc(gfp_t gfp) int n; struct page *page;
- gfp |= ___GFP_RELIABILITY; + if (pagecache_reliable_is_enabled()) + gfp |= ___GFP_RELIABILITY; + else + WARN_ON_ONCE(gfp & ___GFP_RELIABILITY); + if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 89164bc5728b9..5a32977b674fd 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -21,6 +21,7 @@ bool reliable_allow_fallback __read_mostly = true; bool shmem_reliable __read_mostly = true; struct percpu_counter reliable_shmem_used_nr_page __read_mostly;
+bool pagecache_use_reliable_mem __read_mostly = true; void add_reliable_mem_size(long sz) { atomic_long_add(sz, &total_reliable_mem); @@ -249,6 +250,10 @@ static int __init setup_reliable_debug(char *str) shmem_reliable = false; pr_info("shmem reliable disabled."); break; + case 'P': + pagecache_use_reliable_mem = false; + pr_info("disable page cache use reliable memory\n"); + break; default: pr_err("reliable_debug option '%c' unknown. skipped\n", *str);
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Add item "FileCache" to /proc/meminfo to show the amount of page cache on the LRU lists (active file + inactive file).
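The value is reported in kB, i.e. the file LRU page count shifted left by (PAGE_SHIFT - 10); with 4 KiB pages each file page contributes 4 kB, so, as an illustrative example, 25 file pages show up as "FileCache: 100 kB".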
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/mem_reliable.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 796892d5d0560..d4458d7401945 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -165,6 +165,17 @@ void reliable_report_meminfo(struct seq_file *m) &reliable_shmem_used_nr_page) << (PAGE_SHIFT - 10); seq_printf(m, "ReliableShmem: %8lu kB\n", shmem); } + + if (pagecache_reliable_is_enabled()) { + unsigned long num = 0; + + num += global_node_page_state(NR_LRU_BASE + + LRU_ACTIVE_FILE); + num += global_node_page_state(NR_LRU_BASE + + LRU_INACTIVE_FILE); + seq_printf(m, "FileCache: %8lu kB\n", + num << (PAGE_SHIFT - 10)); + } } }
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
If vm_cache_reclaim_s > 120, shrink_page_cache_work is requeued after only 120 seconds even when reclaim makes little progress; that is shorter than vm_cache_reclaim_s and defeats the original intention of extending the interval when reclaim is hard.
To solve this, requeue shrink_page_cache_work after vm_cache_reclaim_s + 120 seconds instead.
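A worked example (numbers are illustrative): with vm_cache_reclaim_s = 300, the old code requeues the work after round_jiffies_relative(120 * HZ), roughly 120 seconds and shorter than the configured 300-second interval; with this change it is requeued after (300 + 120) * HZ, roughly 420 seconds.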
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 67f72f4d9daef..208ee8653da62 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4053,7 +4053,7 @@ static void shrink_page_cache_work(struct work_struct *w) nr_pages = shrink_page_cache(GFP_KERNEL); if ((nr_pages < SWAP_CLUSTER_MAX) && vm_cache_reclaim_enable) queue_delayed_work_on(smp_processor_id(), system_wq, work, - round_jiffies_relative(120 * HZ)); + round_jiffies_relative((vm_cache_reclaim_s + 120) * HZ)); }
static void shrink_page_cache_init(void)
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
-------------------------------------------------------
The handlers cache_limit_ratio_sysctl_handler() and cache_limit_mbytes_sysctl_handler() shrink the page cache even when vm_cache_reclaim_enable is false, which is unexpected.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/page_alloc.c | 6 ++++-- mm/vmscan.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 455525e49c727..3fb21ea5dcf9b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8600,7 +8600,8 @@ int cache_limit_ratio_sysctl_handler(struct ctl_table *table, int write, vm_cache_limit_ratio); else pr_warn("page cache limit off\n"); - while (vm_cache_limit_mbytes && page_cache_over_limit()) + while (vm_cache_limit_mbytes && page_cache_over_limit() && + vm_cache_reclaim_enable) shrink_page_cache(GFP_KERNEL); }
@@ -8628,7 +8629,8 @@ int cache_limit_mbytes_sysctl_handler(struct ctl_table *table, int write, else pr_warn("page cache limit off\n");
- while (vm_cache_limit_mbytes && page_cache_over_limit()) + while (vm_cache_limit_mbytes && page_cache_over_limit() && + vm_cache_reclaim_enable) shrink_page_cache(GFP_KERNEL); }
diff --git a/mm/vmscan.c b/mm/vmscan.c index 208ee8653da62..39b06e6eaa0f0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3562,7 +3562,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) kernel_swap_check(&sc);
#ifdef CONFIG_SHRINK_PAGECACHE - if (vm_cache_limit_mbytes && page_cache_over_limit()) + if (vm_cache_limit_mbytes && page_cache_over_limit() && + vm_cache_reclaim_enable) shrink_page_cache(GFP_KERNEL); #endif
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
The reasons for removing the shrink_page_cache() call from add_to_page_cache() are:
1. Synchronous memory reclamation here hurts performance. 2. In the HugeTLB case add_to_page_cache() does not increase the LRU size, so shrink_page_cache() would not be triggered anyway.
After this change, add_to_page_cache() in mm/filemap.c and the inline version in include/linux/pagemap.h are the same; the copy in mm/filemap.c is not deleted and is kept as an interface for KABI compatibility.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/filemap.c | 2 -- 1 file changed, 2 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c index 320c97244e163..2827e2b670e02 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -991,8 +991,6 @@ int add_to_page_cache(struct page *page, { int error;
- if (vm_cache_limit_mbytes && page_cache_over_limit()) - shrink_page_cache(gfp_mask); __SetPageLocked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error))
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
The amount of page cache should be kept within a limit when CONFIG_MEMORY_RELIABLE is enabled, so during page cache reclaim only page cache should be reclaimed, not both file and anonymous pages.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 39b06e6eaa0f0..a9fca07dba3ba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -56,6 +56,7 @@
#include <linux/swapops.h> #include <linux/balloon_compaction.h> +#include <linux/mem_reliable.h>
#include "internal.h"
@@ -3968,7 +3969,7 @@ static unsigned long __shrink_page_cache(gfp_t mask) .nr_to_reclaim = SWAP_CLUSTER_MAX * (unsigned long)vm_cache_reclaim_weight, .may_unmap = 1, - .may_swap = 1, + .may_swap = mem_reliable_is_enabled() ? 0 : 1, .order = 0, .priority = DEF_PRIORITY, .target_mem_cgroup = NULL,
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
If the page cache is over its limit, page cache reclaim is triggered. Only page cache should be reclaimed, but shrink_node() also reclaims slab by default, so disable shrink_slab() in this path by adding a control parameter to struct scan_control.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmscan.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index a9fca07dba3ba..0623580de1f9b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -101,6 +101,9 @@ struct scan_control { /* One of the zones is ready for compaction */ unsigned int compaction_ready:1;
+ /* can't shrink slab pages */ + unsigned int no_shrink_slab:1; + /* Allocation order */ s8 order;
@@ -2783,8 +2786,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages;
- shrink_slab(sc->gfp_mask, pgdat->node_id, - memcg, sc->priority); + if (!sc->no_shrink_slab) + shrink_slab(sc->gfp_mask, pgdat->node_id, + memcg, sc->priority);
/* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -3970,6 +3974,7 @@ static unsigned long __shrink_page_cache(gfp_t mask) (unsigned long)vm_cache_reclaim_weight, .may_unmap = 1, .may_swap = mem_reliable_is_enabled() ? 0 : 1, + .no_shrink_slab = mem_reliable_is_enabled() ? 0 : 1, .order = 0, .priority = DEF_PRIORITY, .target_mem_cgroup = NULL,
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Add two proc interfaces to set the page cache limit. Both vm_cache_limit_mbytes and vm_cache_limit_ratio are updated when either of the two interfaces is written.
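For example, assuming the entries are registered under /proc/sys/vm as in the hunk below (values are illustrative), the limit can be set in megabytes and the ratio read back:

  echo 1024 > /proc/sys/vm/cache_limit_mbytes
  cat /proc/sys/vm/cache_limit_ratio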
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/sysctl.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 31c35a65fbb78..542611081c610 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1468,6 +1468,24 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "cache_limit_mbytes", + .data = &vm_cache_limit_mbytes, + .maxlen = sizeof(vm_cache_limit_mbytes), + .mode = 0644, + .proc_handler = cache_limit_mbytes_sysctl_handler, + .extra1 = &vm_cache_limit_mbytes_min, + .extra2 = &vm_cache_limit_mbytes_max, + }, + { + .procname = "cache_limit_ratio", + .data = &vm_cache_limit_ratio, + .maxlen = sizeof(vm_cache_limit_ratio), + .mode = 0644, + .proc_handler = cache_limit_ratio_sysctl_handler, + .extra1 = &vm_cache_limit_ratio_min, + .extra2 = &vm_cache_limit_ratio_max, + }, #endif #ifdef CONFIG_HUGETLB_PAGE {
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Function shrink_shepherd queues work on each CPU to shrink the page cache and is called periodically. Without a page_cache_over_limit() check before shrinking, it causes periodic memory reclamation even when the amount of page cache is below the limit, so add a basic check before shrinking the page cache.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmscan.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 0623580de1f9b..efe572fd090c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4056,6 +4056,10 @@ static void shrink_page_cache_work(struct work_struct *w) if (vm_cache_reclaim_s == 0 || !vm_cache_reclaim_enable) return;
+ if (mem_reliable_is_enabled() && + (!vm_cache_limit_mbytes || !page_cache_over_limit())) + return; + /* It should wait more time if we hardly reclaim the page cache */ nr_pages = shrink_page_cache(GFP_KERNEL); if ((nr_pages < SWAP_CLUSTER_MAX) && vm_cache_reclaim_enable)
From: Alexander Duyck alexander.h.duyck@linux.intel.com
stable inclusion from stable-5.10.88 commit 8204e0c1113d6b7f599bcd7ebfbfde72e76c102f bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S
--------------------------------
Provide a new function, queue_work_node, which is meant to schedule work on a "random" CPU of the requested NUMA node. The main motivation for this is to help assist asynchronous init to better improve boot times for devices that are local to a specific node.
For now we just default to the first CPU that is in the intersection of the cpumask of the node and the online cpumask. The only exception is if the CPU is local to the node we will just use the current CPU. This should work for our purposes as we are currently only using this for unbound work so the CPU will be translated to a node anyway instead of being directly used.
As we are only using the first CPU to represent the NUMA node for now I am limiting the scope of the function so that it can only be used with unbound workqueues.
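As a usage illustration (a minimal sketch, not part of this patch; the work item and function names are hypothetical, and the pattern mirrors how a later patch in this series calls the helper):

  #include <linux/workqueue.h>

  /* Runs on a CPU of the requested node when one is online,
   * otherwise on whatever CPU the unbound workqueue picks. */
  static void my_node_work_fn(struct work_struct *work)
  {
  }

  static DECLARE_WORK(my_node_work, my_node_work_fn);

  static void kick_node_work(int nid)
  {
  	/* only valid for unbound workqueues, e.g. system_unbound_wq */
  	queue_work_node(nid, system_unbound_wq, &my_node_work);
  }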
Acked-by: Tejun Heo tj@kernel.org Reviewed-by: Bart Van Assche bvanassche@acm.org Acked-by: Dan Williams dan.j.williams@intel.com Signed-off-by: Alexander Duyck alexander.h.duyck@linux.intel.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yu Liao liaoyu15@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/workqueue.h | 2 + kernel/workqueue.c | 84 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+)
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 93b87cd67ee4a..6f2b042fc44cd 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -481,6 +481,8 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); +extern bool queue_work_node(int node, struct workqueue_struct *wq, + struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a46532be3bcb0..04b558a267aea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1490,6 +1490,90 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL(queue_work_on);
+/** + * workqueue_select_cpu_near - Select a CPU based on NUMA node + * @node: NUMA node ID that we want to select a CPU from + * + * This function will attempt to find a "random" cpu available on a given + * node. If there are no CPUs available on the given node it will return + * WORK_CPU_UNBOUND indicating that we should just schedule to any + * available CPU if we need to schedule this work. + */ +static int workqueue_select_cpu_near(int node) +{ + int cpu; + + /* No point in doing this if NUMA isn't enabled for workqueues */ + if (!wq_numa_enabled) + return WORK_CPU_UNBOUND; + + /* Delay binding to CPU if node is not valid or online */ + if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) + return WORK_CPU_UNBOUND; + + /* Use local node/cpu if we are already there */ + cpu = raw_smp_processor_id(); + if (node == cpu_to_node(cpu)) + return cpu; + + /* Use "random" otherwise know as "first" online CPU of node */ + cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); + + /* If CPU is valid return that, otherwise just defer */ + return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; +} + +/** + * queue_work_node - queue work on a "random" cpu for a given NUMA node + * @node: NUMA node that we are targeting the work for + * @wq: workqueue to use + * @work: work to queue + * + * We queue the work to a "random" CPU within a given NUMA node. The basic + * idea here is to provide a way to somehow associate work with a given + * NUMA node. + * + * This function will only make a best effort attempt at getting this onto + * the right NUMA node. If no node is requested or the requested node is + * offline then we just fall back to standard queue_work behavior. + * + * Currently the "random" CPU ends up being the first available CPU in the + * intersection of cpu_online_mask and the cpumask of the node, unless we + * are running on the node. In that case we just use the current CPU. + * + * Return: %false if @work was already on a queue, %true otherwise. + */ +bool queue_work_node(int node, struct workqueue_struct *wq, + struct work_struct *work) +{ + unsigned long flags; + bool ret = false; + + /* + * This current implementation is specific to unbound workqueues. + * Specifically we only return the first available CPU for a given + * node instead of cycling through individual CPUs within the node. + * + * If this is used with a per-cpu workqueue then the logic in + * workqueue_select_cpu_near would need to be updated to allow for + * some round robin type logic. + */ + WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); + + local_irq_save(flags); + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + int cpu = workqueue_select_cpu_near(node); + + __queue_work(cpu, wq, work); + ret = true; + } + + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_work_node); + void delayed_work_timer_fn(struct timer_list *t) { struct delayed_work *dwork = from_timer(dwork, t, timer);
From: Yu Liao liaoyu15@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Add a sysctl to clear the pages on the free lists of each NUMA node. For each node, every page on the free lists is cleared, and the work is scheduled on a CPU of that node.
When KASAN is enabled and pages are free, their shadow memory is filled with 0xFF, so writing to these free pages would trigger a use-after-free report; therefore disable KASAN for the clear_freelist file.
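For example, with CONFIG_CLEAR_FREELIST_PAGE=y the interface added below is write-only (mode 0200) and is triggered from user space with:

  echo 1 > /proc/sys/vm/clear_freelist_pages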
Signed-off-by: Yu Liao liaoyu15@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/sysctl/vm.txt | 13 +++ mm/Kconfig | 7 ++ mm/Makefile | 2 + mm/clear_freelist_page.c | 163 ++++++++++++++++++++++++++++++++++++ 4 files changed, 185 insertions(+) create mode 100644 mm/clear_freelist_page.c
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 7d73882e2c273..8d824892d00d6 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -20,6 +20,7 @@ Currently, these files are in /proc/sys/vm:
- admin_reserve_kbytes - block_dump +- clear_freelist_pages - compact_memory - compact_unevictable_allowed - dirty_background_bytes @@ -104,6 +105,18 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt.
==============================================================
+clear_freelist_pages + +Available only when CONFIG_CLEAR_FREELIST_PAGE is set. When 1 is written to the +file, all pages in free lists will be written with 0. + +Zone lock is held during clear_freelist_pages, if the execution time is too +long, RCU CPU Stall warnings will be print. For each NUMA node, +clear_freelist_pages is performed on a "random" CPU of the NUMA node. +The time consuming is related to the hardware. + +============================================================== + compact_memory
Available only when CONFIG_COMPACTION is set. When 1 is written to the file, diff --git a/mm/Kconfig b/mm/Kconfig index 80d7b47ca9f53..3a38eb4a6f020 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -849,4 +849,11 @@ config MEMORY_RELIABLE To enable this function, mirrored memory is needed and "kernelcore=reliable" need to be added in kernel parameters.
+config CLEAR_FREELIST_PAGE + bool "Support for clear free list pages" + depends on MMU && SYSCTL + default n + help + Say y here to enable the clear free list pages feature. + endmenu diff --git a/mm/Makefile b/mm/Makefile index 741f9c250914c..38291476ce222 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -6,6 +6,7 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n +KASAN_SANITIZE_clear_freelist_page.o := n
# These files are disabled because they produce non-interesting and/or # flaky coverage that is not a function of syscall inputs. E.g. slab is out of @@ -110,3 +111,4 @@ obj-$(CONFIG_PIN_MEMORY) += pin_mem.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o +obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o diff --git a/mm/clear_freelist_page.c b/mm/clear_freelist_page.c new file mode 100644 index 0000000000000..69975f458dc79 --- /dev/null +++ b/mm/clear_freelist_page.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for clear free list pages. + */ + +#include <linux/mmzone.h> +#include <linux/mm_types.h> +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/nmi.h> +#include <linux/module.h> + +#define for_each_populated_zone_pgdat(pgdat, zone) \ + for (zone = pgdat->node_zones; \ + zone; \ + zone = next_pgdat_zone(zone)) \ + if (!populated_zone(zone)) \ + ; /* do nothing */ \ + else + +struct pgdat_entry { + struct pglist_data *pgdat; + struct work_struct work; +}; + +static DECLARE_WAIT_QUEUE_HEAD(clear_freelist_wait); +static DEFINE_MUTEX(clear_freelist_lock); +static atomic_t clear_freelist_workers; +static atomic_t clear_pages_num; +static int one = 1; + +/* + * next_pgdat_zone - helper magic for for_each_populated_zone_pgdat() + */ +static struct zone *next_pgdat_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else + zone = NULL; + return zone; +} + +static void clear_pgdat_freelist_pages(struct work_struct *work) +{ + struct pgdat_entry *entry = container_of(work, struct pgdat_entry, work); + struct pglist_data *pgdat = entry->pgdat; + unsigned long flags, order, t; + struct page *page; + struct zone *zone; + + for_each_populated_zone_pgdat(pgdat, zone) { + spin_lock_irqsave(&zone->lock, flags); + for_each_migratetype_order(order, t) { + list_for_each_entry(page, &zone->free_area[order].free_list[t], lru) { +#ifdef CONFIG_KMAP_LOCAL + int i; + + /* Clear highmem by clear_highpage() */ + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +#else + memset(page_address(page), 0, (1 << order) * PAGE_SIZE); +#endif + touch_nmi_watchdog(); + atomic_add(1 << order, &clear_pages_num); + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + cond_resched(); + } + kfree(entry); + + if (atomic_dec_and_test(&clear_freelist_workers)) + wake_up(&clear_freelist_wait); +} + +static void init_clear_freelist_work(struct pglist_data *pgdat) +{ + struct pgdat_entry *entry; + + entry = kzalloc(sizeof(struct pgdat_entry), GFP_KERNEL); + if (!entry) + return; + + entry->pgdat = pgdat; + INIT_WORK(&entry->work, clear_pgdat_freelist_pages); + queue_work_node(pgdat->node_id, system_unbound_wq, &entry->work); +} + +static void clear_freelist_pages(void) +{ + struct pglist_data *pgdat; + + mutex_lock(&clear_freelist_lock); + drain_all_pages(NULL); + + for_each_online_pgdat(pgdat) { + atomic_inc(&clear_freelist_workers); + init_clear_freelist_work(pgdat); + } + + wait_event(clear_freelist_wait, atomic_read(&clear_freelist_workers) == 0); + + pr_debug("Cleared pages %d\nFree pages %lu\n", atomic_read(&clear_pages_num), + global_zone_page_state(NR_FREE_PAGES)); + atomic_set(&clear_pages_num, 0); + + mutex_unlock(&clear_freelist_lock); +} + 
+static int sysctl_clear_freelist_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + int val; + + table->data = &val; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write) + clear_freelist_pages(); + + return ret; +} + +static struct ctl_table clear_freelist_table[] = { + { + .procname = "clear_freelist_pages", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &sysctl_clear_freelist_handler, + .extra1 = &one, + .extra2 = &one, + }, + { } +}; + +static struct ctl_table sys_ctl_table[] = { + { + .procname = "vm", + .mode = 0555, + .child = clear_freelist_table, + }, + { } +}; + +static int __init clear_freelist_init(void) +{ + register_sysctl_table(sys_ctl_table); + + return 0; +} +module_init(clear_freelist_init);
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
--------------------------------
Enable CONFIG_MEMORY_RELIABLE, CONFIG_CLEAR_FREELIST_PAGE and CONFIG_EFI_FAKE_MEMMAP for testing.
Reviewed-by: Chen Wandun chenwandun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/configs/hulk_defconfig | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index f3cf87b98f3ef..f3a15e856b5f7 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -608,6 +608,7 @@ CONFIG_EFI_VARS=y CONFIG_EFI_ESRT=y CONFIG_EFI_VARS_PSTORE=y CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y +CONFIG_EFI_FAKE_MEMMAP=y CONFIG_EFI_PARAMS_FROM_FDT=y CONFIG_EFI_RUNTIME_WRAPPERS=y CONFIG_EFI_ARMSTUB=y @@ -1003,6 +1004,8 @@ CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y CONFIG_ARCH_HAS_PTE_SPECIAL=y CONFIG_PIN_MEMORY=y CONFIG_PID_RESERVE=y +CONFIG_MEMORY_RELIABLE=y +CONFIG_CLEAR_FREELIST_PAGE=y CONFIG_NET=y CONFIG_NET_INGRESS=y CONFIG_NET_EGRESS=y