euleros inclusion category: feature feature: Add pages to swapcache and swap them out proactively bugzilla: NA CVE: NA
-----------------------------------------------------
Each CPU socket can have 1 DRAM and 1 PMEM node; we call them "peer nodes". Migration between DRAM and PMEM will by default happen between peer nodes.
This is a temporary solution. With multiple memory layers, a node can have both promotion and demotion targets instead of a single peer node. User space may also be able to infer promotion/demotion targets based on future HMAT info.
Signed-off-by: Fan Du fan.du@intel.com Signed-off-by: Fengguang Wu fengguang.wu@intel.com Signed-off-by: Shijie Luo luoshijie1@huawei.com Signed-off-by: liubo liubo254@huawei.com Reviewed-by: louhongxiang louhongxiang@huawei.com Signed-off-by: Yuchen Tang tangyuchen5@huawei.com --- drivers/base/node.c | 7 +++++++ include/linux/etmem.h | 1 + include/linux/mmzone.h | 14 ++++++++++++++ mm/etmem.c | 28 ++++++++++++++++++++++++++++ mm/mm_init.c | 4 ++++ 5 files changed, 54 insertions(+)
diff --git a/drivers/base/node.c b/drivers/base/node.c index 493d533f8375..f4a7d5590c0c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -569,11 +569,18 @@ static ssize_t node_read_distance(struct device *dev, } static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
+#ifdef CONFIG_ETMEM +static DEVICE_ATTR_RO(peer_node); +#endif + static struct attribute *node_dev_attrs[] = { &dev_attr_meminfo.attr, &dev_attr_numastat.attr, &dev_attr_distance.attr, &dev_attr_vmstat.attr, +#ifdef CONFIG_ETMEM + &dev_attr_peer_node.attr, +#endif NULL };
diff --git a/include/linux/etmem.h b/include/linux/etmem.h index 5ebd1c3274b7..fc19cdc0d567 100644 --- a/include/linux/etmem.h +++ b/include/linux/etmem.h @@ -9,6 +9,7 @@ #include <linux/page-flags.h>
#ifdef CONFIG_ETMEM +int find_best_peer_node(int nid);
#if IS_ENABLED(CONFIG_KVM) static inline struct kvm *mm_kvm(struct mm_struct *mm) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 57086c57b8e4..c5c9b9c60ea5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1410,6 +1410,20 @@ typedef struct pglist_data {
CACHELINE_PADDING(_pad2_);
+#ifdef CONFIG_ETMEM + /* + * Points to the nearest node in terms of latency + * E.g. peer of node 0 is node 2 per SLIT + * node distances: + * node 0 1 2 3 + * 0: 10 21 17 28 + * 1: 21 10 28 17 + * 2: 17 28 10 28 + * 3: 28 17 28 10 + */ + int peer_node; + int notused; +#endif /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; diff --git a/mm/etmem.c b/mm/etmem.c index 9a89bfcc1058..fe5ca89a4e76 100644 --- a/mm/etmem.c +++ b/mm/etmem.c @@ -9,6 +9,34 @@ #include <linux/etmem.h> #include "internal.h"
+/* + * Return the nearest peer node in terms of *locality* + * E.g. peer of node 0 is node 2 per SLIT + * node distances: + * node 0 1 2 3 + * 0: 10 21 17 28 + * 1: 21 10 28 17 + * 2: 17 28 10 28 + * 3: 28 17 28 10 + */ +int find_best_peer_node(int nid) +{ + int n, val; + int min_val = INT_MAX; + int peer = NUMA_NO_NODE; + + for_each_online_node(n) { + if (n == nid) + continue; + val = node_distance(nid, n); + if (val < min_val) { + min_val = val; + peer = n; + } + } + return peer; +} + int add_page_for_swap(struct page *page, struct list_head *pagelist) { int err = -EBUSY; diff --git a/mm/mm_init.c b/mm/mm_init.c index fed4370b02e1..1a4efa3e76cd 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -29,6 +29,7 @@ #include "internal.h" #include "slab.h" #include "shuffle.h" +#include <linux/etmem.h>
#include <asm/setup.h>
@@ -1901,6 +1902,9 @@ void __init free_area_init(unsigned long *max_zone_pfn) if (pgdat->node_present_pages) node_set_state(nid, N_MEMORY); check_for_memory(pgdat); +#ifdef CONFIG_ETMEM + pgdat->peer_node = find_best_peer_node(nid); +#endif }
memmap_init();