
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC8KS8
CVE: NA

--------------------------------

Add a demotion mechanism to DAMON VA that migrates cold (inactive)
memory pages to a user-specified nodemask.

In heterogeneous memory systems with multi-tier memory (e.g. DRAM, CXL,
SSDs), the nodemask can target lower-performance or capacity-tier nodes
such as CXL memory expanders or software-defined NUMA nodes. Demoting
cold memory to these designated nodes improves memory hierarchy
utilization, enhances overall system performance, and helps reduce
memory pressure on the hot tiers.

The interface offers finer-grained control over memory placement and is
especially useful in memory tiering and aging scenarios.

Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
---
 include/linux/damon.h          |   4 +
 include/linux/mempolicy.h      |   2 +
 include/linux/migrate_mode.h   |   1 +
 include/trace/events/migrate.h |   3 +-
 mm/damon/sysfs-schemes.c       |  40 +++++++++
 mm/damon/vaddr.c               |  40 +++++++++
 mm/mempolicy.c                 | 146 +++++++++++++++++++++++++++++++++
 7 files changed, 235 insertions(+), 1 deletion(-)
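A minimal usage sketch follows; it is illustrative only and not part of
the patch. It assumes the stock DAMON sysfs layout under
/sys/kernel/mm/damon/admin/ with kdamond 0, context 0 and scheme 0
already created, and uses nodes 2-3 as example demotion targets; the
path and the node IDs are assumptions, not something mandated by this
patch. The new remote_node file sits next to the existing action file
because the attribute is added to damon_sysfs_scheme_attrs[]: "action"
accepts the new "demotion" keyword, and "remote_node" takes a nodelist
that is parsed with nodelist_parse() and restricted to N_MEMORY nodes.
A standalone model of the <source, dest> node pairing used by
do_migrate_area_pages() follows at the very end, after the diff.

/*
 * Usage sketch (illustrative): select the new "demotion" action for
 * scheme 0 of kdamond 0 and demote its cold regions to nodes 2-3.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define SCHEME_DIR "/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/"

static int write_attr(const char *file, const char *val)
{
        char path[256];
        ssize_t ret;
        int fd;

        snprintf(path, sizeof(path), SCHEME_DIR "%s", file);
        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        ret = write(fd, val, strlen(val));
        close(fd);
        return ret < 0 ? -1 : 0;
}

int main(void)
{
        /* new DAMOS action string registered in damon_sysfs_damos_action_strs[] */
        if (write_attr("action", "demotion"))
                return 1;
        /* nodelist format, parsed by nodelist_parse() in remote_node_store() */
        if (write_attr("remote_node", "2-3"))
                return 1;
        return 0;
}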
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 424431d5a100..e544de649dc3 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -93,6 +93,7 @@ struct damon_target {
  * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
  * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists.
  * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists.
+ * @DAMOS_DEMOTION: Migrate cold page areas to specific nodes.
  * @DAMOS_STAT: Do nothing but count the stat.
  * @NR_DAMOS_ACTIONS: Total number of DAMOS actions
  *
@@ -110,6 +111,7 @@ enum damos_action {
         DAMOS_NOHUGEPAGE,
         DAMOS_LRU_PRIO,
         DAMOS_LRU_DEPRIO,
+        DAMOS_DEMOTION,
         DAMOS_STAT,        /* Do nothing but only record the stat */
         NR_DAMOS_ACTIONS,
 };
@@ -302,6 +304,7 @@ struct damos_access_pattern {
  * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
  * @pattern: Access pattern of target regions.
  * @action: &damo_action to be applied to the target regions.
+ * @remote_node: Nodemask of the nodes to which cold pages will be demoted.
  * @apply_interval_us: The time between applying the @action.
  * @quota: Control the aggressiveness of this scheme.
  * @wmarks: Watermarks for automated (in)activation of this scheme.
@@ -334,6 +337,7 @@ struct damos_access_pattern {
 struct damos {
         struct damos_access_pattern pattern;
         enum damos_action action;
+        nodemask_t remote_node;
         unsigned long apply_interval_us;
 /* private: internal use only */
         /*
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 2e81ac87e6f6..29cc0d842a8f 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -167,6 +167,8 @@ static inline void check_highest_zone(enum zone_type k)
 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                      const nodemask_t *to, int flags);
+int do_migrate_area_pages(struct mm_struct *mm, const nodemask_t *from,
+                const nodemask_t *to, unsigned long start, unsigned long end, int flags);
 
 #ifdef CONFIG_TMPFS
 extern int mpol_parse_str(char *str, struct mempolicy **mpol);
diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index f37cc03f9369..302c659dc626 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -29,6 +29,7 @@ enum migrate_reason {
         MR_CONTIG_RANGE,
         MR_LONGTERM_PIN,
         MR_DEMOTION,
+        MR_DAMON_DEMOTION,
         MR_TYPES
 };
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 0190ef725b43..bafe4208de73 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -22,7 +22,8 @@
         EM( MR_NUMA_MISPLACED,  "numa_misplaced")       \
         EM( MR_CONTIG_RANGE,    "contig_range")         \
         EM( MR_LONGTERM_PIN,    "longterm_pin")         \
-        EMe(MR_DEMOTION,        "demotion")
+        EM(MR_DEMOTION,         "demotion")             \
+        EMe(MR_DAMON_DEMOTION,  "damon_demotion")
 
 /*
  * First define the enums in the above macros to be exported to userspace
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 26c948f87489..dc570e90abca 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1123,6 +1123,7 @@ static const struct kobj_type damon_sysfs_access_pattern_ktype = {
 struct damon_sysfs_scheme {
         struct kobject kobj;
         enum damos_action action;
+        nodemask_t remote_node;
         struct damon_sysfs_access_pattern *access_pattern;
         struct damon_sysfs_quotas *quotas;
         struct damon_sysfs_watermarks *watermarks;
@@ -1140,6 +1141,7 @@ static const char * const damon_sysfs_damos_action_strs[] = {
         "nohugepage",
         "lru_prio",
         "lru_deprio",
+        "demotion",
         "stat",
 };
@@ -1153,6 +1155,7 @@ static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
                 return NULL;
         scheme->kobj = (struct kobject){};
         scheme->action = action;
+        scheme->remote_node = NODE_MASK_ALL;
         return scheme;
 }
@@ -1356,6 +1359,36 @@ static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
         return -EINVAL;
 }
 
+static ssize_t remote_node_show(struct kobject *kobj, struct kobj_attribute *attr,
+                char *buf)
+{
+        struct damon_sysfs_scheme *scheme = container_of(kobj,
+                        struct damon_sysfs_scheme, kobj);
+
+        return sysfs_emit(buf, "%*pbl\n",
+                        nodemask_pr_args(&scheme->remote_node));
+}
+
+static ssize_t remote_node_store(struct kobject *kobj, struct kobj_attribute *attr,
+                const char *buf, size_t count)
+{
+        struct damon_sysfs_scheme *scheme = container_of(kobj,
+                        struct damon_sysfs_scheme, kobj);
+        int ret;
+        nodemask_t new_mask;
+
+        ret = nodelist_parse(buf, new_mask);
+        if (ret < 0)
+                return -EINVAL;
+
+        if (!nodes_subset(new_mask, node_states[N_MEMORY]))
+                return -EINVAL;
+
+        nodes_and(scheme->remote_node, new_mask, node_states[N_MEMORY]);
+        return count;
+}
+
 static void damon_sysfs_scheme_release(struct kobject *kobj)
 {
         kfree(container_of(kobj, struct damon_sysfs_scheme, kobj));
@@ -1364,8 +1397,12 @@ static void damon_sysfs_scheme_release(struct kobject *kobj)
 static struct kobj_attribute damon_sysfs_scheme_action_attr =
                 __ATTR_RW_MODE(action, 0600);
 
+static struct kobj_attribute damon_sysfs_scheme_remote_node_attr =
+                __ATTR_RW_MODE(remote_node, 0600);
+
 static struct attribute *damon_sysfs_scheme_attrs[] = {
         &damon_sysfs_scheme_action_attr.attr,
+        &damon_sysfs_scheme_remote_node_attr.attr,
         NULL,
 };
 ATTRIBUTE_GROUPS(damon_sysfs_scheme);
@@ -1644,6 +1681,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme,
         scheme->pattern.max_age_region = access_pattern->age->max;
 
         scheme->action = sysfs_scheme->action;
+        scheme->remote_node = sysfs_scheme->remote_node;
         scheme->quota.ms = sysfs_quotas->ms;
         scheme->quota.sz = sysfs_quotas->sz;
@@ -1687,6 +1725,8 @@ int damon_sysfs_set_schemes(struct damon_ctx *ctx,
                         damon_destroy_scheme(scheme);
                         return -ENOMEM;
                 }
+
+                scheme->remote_node = sysfs_schemes->schemes_arr[i]->remote_node;
                 damon_add_scheme(ctx, scheme);
         }
         return 0;
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 167ca23d77ae..3a21410e631e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -14,6 +14,7 @@
 #include <linux/page_idle.h>
 #include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
+#include <linux/cpuset.h>
 
 #include "ops-common.h"
 
@@ -479,6 +480,41 @@ static inline void hw_damon_va_prepare_access_checks(struct damon_ctx *ctx) { }
 static inline unsigned int hw_damon_va_check_accesses(struct damon_ctx *ctx) {return 0; }
 #endif
 
+#ifdef CONFIG_MIGRATION
+static unsigned long damon_migrate_pages(struct damon_target *t,
+                struct damon_region *r, nodemask_t task_remote_nodes)
+{
+        struct mm_struct *mm = NULL;
+        unsigned long applied;
+        struct task_struct *task;
+        nodemask_t task_nodes;
+
+        task = damon_get_task_struct(t);
+        if (!task)
+                return 0;
+        task_nodes = cpuset_mems_allowed(task);
+        put_task_struct(task);
+
+        mm = damon_get_mm(t);
+        if (!mm)
+                return 0;
+
+        applied = do_migrate_area_pages(mm, &task_nodes, &task_remote_nodes,
+                        r->ar.start, r->ar.end, MPOL_MF_MOVE_ALL);
+
+        mmput(mm);
+
+        return applied;
+}
+
+#else
+static inline unsigned long damon_migrate_pages(struct damon_target *t,
+                struct damon_region *r, nodemask_t task_remote_nodes)
+{
+        return 0;
+}
+#endif /* CONFIG_MIGRATION */
+
 /*
  * Functions for the access checking of the regions
  */
@@ -757,6 +793,8 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
         case DAMOS_NOHUGEPAGE:
                 madv_action = MADV_NOHUGEPAGE;
                 break;
+        case DAMOS_DEMOTION:
+                return damon_migrate_pages(t, r, scheme->remote_node);
         case DAMOS_STAT:
                 return 0;
         default:
@@ -777,6 +815,8 @@ static int damon_va_scheme_score(struct damon_ctx *context,
         switch (scheme->action) {
         case DAMOS_PAGEOUT:
                 return damon_cold_score(context, r, scheme);
+        case DAMOS_DEMOTION:
+                return damon_cold_score(context, r, scheme);
         default:
                 break;
         }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 219c098b3ffa..88f0bb008efd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1104,6 +1104,46 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
         return err;
 }
 
+/*
+ * Migrate pages in the given address range from one node to a target node.
+ * Returns an error or the number of pages not migrated.
+ */
+static int migrate_area_to_node(struct mm_struct *mm, int source, int dest,
+                unsigned long start, unsigned long end, int flags)
+{
+        nodemask_t nmask;
+        struct vm_area_struct *vma;
+        LIST_HEAD(pagelist);
+        int err = 0;
+        struct migration_target_control mtc = {
+                .nid = dest,
+                .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+        };
+
+        nodes_clear(nmask);
+        node_set(source, nmask);
+
+        /*
+         * This does not "check" the range but isolates all pages that
+         * need migration.  Between passing in the requested address range
+         * and MPOL_MF_DISCONTIG_OK, this call can not fail.
+         */
+        vma = find_vma(mm, 0);
+        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+        queue_pages_range(mm, start, end, &nmask,
+                        flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);
+
+        if (!list_empty(&pagelist)) {
+                err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+                        (unsigned long)&mtc, MIGRATE_SYNC, MR_DAMON_DEMOTION, NULL);
+                if (err)
+                        putback_movable_pages(&pagelist);
+        }
+
+        return err;
+}
+
 /*
  * Move pages between the two nodesets so as to preserve the physical
  * layout as much as possible.
@@ -1209,6 +1249,112 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 }
 
+/*
+ * Move the pages of an mm area between the two nodesets so as to preserve
+ * the physical layout as much as possible.
+ *
+ * Returns the number of pages that could not be moved.
+ */
+int do_migrate_area_pages(struct mm_struct *mm, const nodemask_t *from,
+                const nodemask_t *to, unsigned long start,
+                unsigned long end, int flags)
+{
+        int busy = 0;
+        int err = 0;
+        nodemask_t tmp;
+
+        lru_cache_disable();
+
+        mmap_read_lock(mm);
+
+        /*
+         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
+         * bit in 'tmp', and return that <source, dest> pair for migration.
+         * The pair of nodemasks 'to' and 'from' define the map.
+         *
+         * If no pair of bits is found that way, fallback to picking some
+         * pair of 'source' and 'dest' bits that are not the same.  If the
+         * 'source' and 'dest' bits are the same, this represents a node
+         * that will be migrating to itself, so no pages need move.
+         *
+         * If no bits are left in 'tmp', or if all remaining bits left
+         * in 'tmp' correspond to the same bit in 'to', return false
+         * (nothing left to migrate).
+         *
+         * This lets us pick a pair of nodes to migrate between, such that
+         * if possible the dest node is not already occupied by some other
+         * source node, minimizing the risk of overloading the memory on a
+         * node that would happen if we migrated incoming memory to a node
+         * before migrating outgoing memory source that same node.
+         *
+         * A single scan of tmp is sufficient.  As we go, we remember the
+         * most recent <s, d> pair that moved (s != d).  If we find a pair
+         * that not only moved, but what's better, moved to an empty slot
+         * (d is not set in tmp), then we break out then, with that pair.
+         * Otherwise when we finish scanning from_tmp, we at least have the
+         * most recent <s, d> pair that moved.  If we get all the way through
+         * the scan of tmp without finding any node that moved, much less
+         * moved to an empty node, then there is nothing left worth migrating.
+         */
+
+        tmp = *from;
+        while (!nodes_empty(tmp)) {
+                int s, d;
+                int source = NUMA_NO_NODE;
+                int dest = 0;
+
+                for_each_node_mask(s, tmp) {
+
+                        /*
+                         * do_migrate_pages() tries to maintain the relative
+                         * node relationship of the pages established between
+                         * threads and memory areas.
+                         *
+                         * However if the number of source nodes is not equal to
+                         * the number of destination nodes we can not preserve
+                         * this node relative relationship.  In that case, skip
+                         * copying memory from a node that is in the destination
+                         * mask.
+                         *
+                         * Example: [2,3,4] -> [3,4,5] moves everything.
+                         *          [0-7]   -> [3,4,5] moves only 0,1,2,6,7.
+                         */
+
+                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
+                                                (node_isset(s, *to)))
+                                continue;
+
+                        d = node_remap(s, *from, *to);
+                        if (s == d)
+                                continue;
+
+                        source = s;     /* Node moved. Memorize */
+                        dest = d;
+
+                        /* dest not in remaining from nodes? */
+                        if (!node_isset(dest, tmp))
+                                break;
+                }
+                if (source == NUMA_NO_NODE)
+                        break;
+
+                node_clear(source, tmp);
+                err = migrate_area_to_node(mm, source, dest, start, end, flags);
+                if (err > 0)
+                        busy += err;
+                if (err < 0)
+                        break;
+        }
+        mmap_read_unlock(mm);
+
+        lru_cache_enable();
+        if (err < 0)
+                return err;
+        return busy;
+}
+
 /*
  * Allocate a new page for page migration based on vma policy.
  * Start by assuming the page is mapped by the same vma as contains @start.
-- 
2.25.1
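For reference, the <source, dest> pairing rule documented inside
do_migrate_area_pages() (inherited from do_migrate_pages()) can be
modeled in userspace. The sketch below is illustrative only: nodemask_t
is replaced by a plain 64-bit mask and remap() is a simplified stand-in
for the kernel's node_remap() (the ordinal position of the source bit in
'from', taken modulo the weight of 'to'). It reproduces the two examples
from the comment: [2,3,4] -> [3,4,5] moves every node, while
[0-7] -> [3,4,5] moves only nodes 0, 1, 2, 6 and 7.

/* Standalone model (not kernel code) of the node pairing loop. */
#include <stdint.h>
#include <stdio.h>

static int weight(uint64_t m)
{
        return __builtin_popcountll(m);
}

/* number of set bits of 'mask' below bit 's' */
static int bit_index(uint64_t mask, int s)
{
        return weight(mask & ((1ULL << s) - 1));
}

/* the n-th set bit of 'mask' */
static int nth_bit(uint64_t mask, int n)
{
        for (int b = 0; b < 64; b++)
                if ((mask & (1ULL << b)) && n-- == 0)
                        return b;
        return -1;
}

/* map 's' by its position in 'from' onto 'to', wrapping like node_remap() */
static int remap(int s, uint64_t from, uint64_t to)
{
        return nth_bit(to, bit_index(from, s) % weight(to));
}

static void show_pairs(uint64_t from, uint64_t to)
{
        uint64_t tmp = from;

        while (tmp) {
                int source = -1, dest = 0;

                for (int s = 0; s < 64; s++) {
                        if (!(tmp & (1ULL << s)))
                                continue;
                        /* unequal weights: skip sources already in the destination mask */
                        if (weight(from) != weight(to) && (to & (1ULL << s)))
                                continue;
                        int d = remap(s, from, to);
                        if (s == d)
                                continue;
                        source = s;     /* node moved, memorize */
                        dest = d;
                        /* best pick: dest is not itself a pending source */
                        if (!(tmp & (1ULL << d)))
                                break;
                }
                if (source < 0)
                        break;
                tmp &= ~(1ULL << source);
                printf("migrate node %d -> node %d\n", source, dest);
        }
}

int main(void)
{
        show_pairs(0x1c, 0x38); /* [2,3,4] -> [3,4,5]: everything moves */
        show_pairs(0xff, 0x38); /* [0-7]   -> [3,4,5]: only 0,1,2,6,7 move */
        return 0;
}

The break on '!(tmp & (1ULL << d))' mirrors the kernel's preference for
a destination that is not itself still waiting to be drained, which
avoids piling incoming pages onto a node whose own pages have not been
migrated out yet.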