From: Jinjiang Tu <tujinjiang@huawei.com>

Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 arch/arm64/include/asm/mmu_context.h |   3 +
 arch/arm64/mm/context.c              | 215 +++++++++++++++++++++++++++
 2 files changed, 218 insertions(+)

diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index a6fb325424e7..4c0e29a86ac2 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -298,6 +298,9 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
 	return -1UL >> 8;
 }
 
+int get_domain_cpumask(int domain, cpumask_t *cpus);
+int trans_cpumask_to_domain(const cpumask_t *active_cpus);
+
 #include <asm-generic/mmu_context.h>
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index b2ac06246327..fd6d093bbdff 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -420,3 +420,218 @@ static int asids_init(void)
 	return 0;
 }
 early_initcall(asids_init);
+
+/*
+ * Data structures and helpers for translating a cpumask into a TLB
+ * broadcast domain.
+ */
+#include <linux/bitmap.h>
+
+#define NR_DOMAINS	(128)
+#define CLUSTER_PER_DIE	(6)
+#define CPU_PER_CLUSTER	(8)
+
+struct domain_entry {
+	/* fields passed via ACPI */
+	int socket0;
+	int socket1;
+	int die0;
+	int die1;
+	unsigned long cluster0_mask;
+	unsigned long cluster1_mask;
+	int domain_id;
+	/* fields initialized by the kernel */
+	cpumask_t cpumask;
+};
+static struct domain_entry domain_table[NR_DOMAINS];
+
+struct translation_entry {
+	cpumask_t cpumask;
+	int domain_id;
+};
+/*
+ * Row i holds the domains spanning exactly i CPUs, so the row index
+ * runs over 0..NR_CPUS inclusive: NR_CPUS + 1 rows are required (a
+ * domain covering every CPU previously indexed one row past the end).
+ */
+static struct translation_entry translation_table[NR_CPUS + 1][NR_DOMAINS];
+
+/*
+ * Return the id of the smallest domain whose cpumask is a superset of
+ * @active_cpus, or 0 (the all-CPU domain) when no such domain exists.
+ */
+int trans_cpumask_to_domain(const cpumask_t *active_cpus)
+{
+	struct translation_entry *entry;
+	/* NOTE(review): a cpumask_t on the stack is large for big NR_CPUS */
+	cpumask_t cpumask;
+	int i, j;
+
+	/*
+	 * translation_table is sparse: every domain sits on exactly one
+	 * row (the one matching its CPU count), so this scan visits at
+	 * most NR_DOMAINS valid entries, not NR_CPUS * NR_DOMAINS.
+	 *
+	 * Starting at CPU_PER_CLUSTER assumes no domain spans fewer CPUs
+	 * than one cluster -- TODO confirm against the ACPI description.
+	 */
+	for (i = max(CPU_PER_CLUSTER, cpumask_weight(active_cpus)); i <= NR_CPUS; i++) {
+		for (j = 0; j < NR_DOMAINS; j++) {
+			entry = &translation_table[i][j];
+			/* rows are filled front to back; -1 ends the row */
+			if (entry->domain_id == -1)
+				break;
+
+			if (cpumask_and(&cpumask, active_cpus, &entry->cpumask) &&
+			    cpumask_equal(&cpumask, active_cpus))
+				return entry->domain_id;
+		}
+	}
+
+	/* No covering domain found: fall back to domain 0. */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(trans_cpumask_to_domain);
+
+/*
+ * Copy the cpumask of @domain into @cpus.  Returns 0 on success or
+ * -EINVAL for an out-of-range domain id.
+ */
+int get_domain_cpumask(int domain, cpumask_t *cpus)
+{
+	if (domain < 0 || domain >= NR_DOMAINS)
+		return -EINVAL;
+
+	cpumask_copy(cpus, &domain_table[domain].cpumask);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(get_domain_cpumask);
+
+/*
+ * Add @cpu to @domain's cpumask when the cpu's (socket, die, cluster)
+ * triple matches either of the two (socket, die, cluster_mask) tuples
+ * the domain was described with.
+ */
+static void init_domain_entry(int cpu, int domain)
+{
+	struct domain_entry *de = &domain_table[domain];
+	int socket = cpu_topology[cpu].package_id;
+	int die = cpu_to_node(cpu);
+	long cluster = cpu_topology[cpu].cluster_id;
+	unsigned long map;
+
+	/* cluster_id is -1 when unknown; such CPUs join no domain */
+	if (cluster < 0 || cluster >= CLUSTER_PER_DIE)
+		return;
+
+	if (de->socket0 == socket && de->die0 == die) {
+		map = 1UL << cluster;	/* 1UL: shift in unsigned long width */
+		if (bitmap_and(&map, &map, &de->cluster0_mask,
+			       CLUSTER_PER_DIE)) {
+			cpumask_set_cpu(cpu, &de->cpumask);
+			return;
+		}
+	}
+
+	if (de->socket1 == socket && de->die1 == die) {
+		map = 1UL << cluster;
+		if (bitmap_and(&map, &map, &de->cluster1_mask,
+			       CLUSTER_PER_DIE)) {
+			cpumask_set_cpu(cpu, &de->cpumask);
+			return;
+		}
+	}
+}
+
+/*
+ * Build domain_table cpumasks from the ACPI-provided fields and lay
+ * the result out in translation_table.  Must run after
+ * parse_acpi_topology() so cpu_topology[] is valid.
+ */
+int __init called_after_parse_acpi_topology(void)
+{
+	/* One fill cursor per translation_table row. */
+	static int pos[NR_CPUS + 1] __initdata;
+	int cpu, domain, row;
+
+	for (domain = 0; domain < NR_DOMAINS; domain++)
+		cpumask_clear(&domain_table[domain].cpumask);
+
+	/* Resolve each domain's ACPI description into a cpumask. */
+	for_each_possible_cpu(cpu)
+		for (domain = 0; domain < NR_DOMAINS; domain++)
+			init_domain_entry(cpu, domain);
+
+	/* -1 marks the end of the valid entries on every row. */
+	for (row = 0; row <= NR_CPUS; row++) {
+		pos[row] = 0;
+		for (domain = 0; domain < NR_DOMAINS; domain++)
+			translation_table[row][domain].domain_id = -1;
+	}
+
+	/*
+	 * Place every domain on the translation_table row matching its
+	 * CPU count.  A wider mask means a more expensive flush, so
+	 * trans_cpumask_to_domain() can start scanning at the row whose
+	 * entries span exactly as many CPUs as the mask it was given.
+	 */
+	for (domain = 0; domain < NR_DOMAINS; domain++) {
+		int nr_cpus = cpumask_weight(&domain_table[domain].cpumask);
+
+		/* an empty domain can never cover a flush request */
+		if (!nr_cpus)
+			continue;
+
+		cpumask_copy(&translation_table[nr_cpus][pos[nr_cpus]].cpumask,
+			     &domain_table[domain].cpumask);
+		translation_table[nr_cpus][pos[nr_cpus]].domain_id =
+			domain_table[domain].domain_id;
+		pos[nr_cpus]++;
+	}
+
+	return 0;
+}
+
+/*
+ * Debug/bring-up variant of the table setup above: the cpu -> domain
+ * mapping is hardcoded instead of coming from ACPI.
+ */
+static int __init tlbi_domain_hardcode_init(void)
+{
+	/* Row = cpu; entries list its domain ids, 0-terminated. */
+	static const int hardcode_cpu_to_domain[NR_CPUS][NR_DOMAINS] __initconst = {
+		/* cluster 0: cpus 0-7 -> domain 1 */
+		{1}, {1}, {1}, {1}, {1}, {1}, {1}, {1},
+		/* cluster 1: cpus 8-15 -> domain 2 */
+		{2}, {2}, {2}, {2}, {2}, {2}, {2}, {2},
+	};
+	static int pos[NR_CPUS + 1] __initdata;
+	int cpu, domain, i, row;
+
+	/*
+	 * Nothing else initializes domain_id in the hardcode path; without
+	 * this, every translation entry would report domain 0.
+	 */
+	for (domain = 0; domain < NR_DOMAINS; domain++) {
+		domain_table[domain].domain_id = domain;
+		if (domain)
+			cpumask_clear(&domain_table[domain].cpumask);
+	}
+
+	/* Domain 0 always covers every possible CPU. */
+	cpumask_copy(&domain_table[0].cpumask, cpu_possible_mask);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		for (i = 0; i < NR_DOMAINS; i++) {
+			domain = hardcode_cpu_to_domain[cpu][i];
+			if (!domain)
+				break;
+			cpumask_set_cpu(cpu, &domain_table[domain].cpumask);
+		}
+	}
+
+	/* -1 marks the end of the valid entries on every row. */
+	for (row = 0; row <= NR_CPUS; row++) {
+		pos[row] = 0;
+		for (domain = 0; domain < NR_DOMAINS; domain++)
+			translation_table[row][domain].domain_id = -1;
+	}
+
+	/* Same row-by-CPU-count layout as called_after_parse_acpi_topology(). */
+	for (domain = 0; domain < NR_DOMAINS; domain++) {
+		int nr_cpus = cpumask_weight(&domain_table[domain].cpumask);
+
+		if (!nr_cpus)
+			continue;
+
+		cpumask_copy(&translation_table[nr_cpus][pos[nr_cpus]].cpumask,
+			     &domain_table[domain].cpumask);
+		translation_table[nr_cpus][pos[nr_cpus]].domain_id =
+			domain_table[domain].domain_id;
+		pos[nr_cpus]++;
+	}
+
+	for (domain = 0; domain < NR_DOMAINS; domain++) {
+		if (!cpumask_empty(&domain_table[domain].cpumask))
+			pr_info("domain:%d cpumask: %*pbl\n", domain,
+				cpumask_pr_args(&domain_table[domain].cpumask));
+	}
+
+	return 0;
+}
+arch_initcall(tlbi_domain_hardcode_init);
-- 
2.25.1