From: Jinjiang Tu <tujinjiang@huawei.com>

Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
 arch/arm64/include/asm/mmu_context.h |   3 +
 arch/arm64/mm/context.c              | 215 +++++++++++++++++++++++++++
 2 files changed, 218 insertions(+)

diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index a6fb325424e7..4c0e29a86ac2 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -298,6 +298,9 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
 	return -1UL >> 8;
 }
 
+int get_domain_cpumask(int domain, cpumask_t *cpus);
+int trans_cpumask_to_domain(const cpumask_t *active_cpus);
+
 #include <asm-generic/mmu_context.h>
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index b2ac06246327..fd6d093bbdff 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -420,3 +420,218 @@ static int asids_init(void)
 	return 0;
 }
 early_initcall(asids_init);
+
+/*
+ * Data structures and helpers for translating a cpumask into a TLB
+ * broadcast domain.
+ */
+#include <linux/bitmap.h>
+
+#define NR_DOMAINS	(128)
+#define CLUSTER_PER_DIE	(6)
+#define CPU_PER_CLUSTER	(8)
+
+struct domain_entry {
+	/* fields passed via ACPI */
+	int socket0;
+	int socket1;
+	int die0;
+	int die1;
+	unsigned long cluster0_mask;
+	unsigned long cluster1_mask;
+	int domain_id;
+	/* fields initialized by the kernel */
+	cpumask_t cpumask;
+};
+static struct domain_entry domain_table[NR_DOMAINS];
+
+struct translation_entry {
+	cpumask_t cpumask;
+	int domain_id;
+};
+/*
+ * Row i holds the domains spanning exactly i CPUs, so the row index
+ * runs over 0..NR_CPUS inclusive: NR_CPUS + 1 rows are required (a
+ * domain covering every CPU previously indexed one row past the end).
+ */
+static struct translation_entry translation_table[NR_CPUS + 1][NR_DOMAINS];
+
+/*
+ * Return the id of the smallest domain whose cpumask is a superset of
+ * @active_cpus, or 0 (the all-CPU domain) when no such domain exists.
+ */
+int trans_cpumask_to_domain(const cpumask_t *active_cpus)
+{
+	struct translation_entry *entry;
+	/* NOTE(review): a cpumask_t on the stack is large for big NR_CPUS */
+	cpumask_t cpumask;
+	int i, j;
+
+	/*
+	 * translation_table is sparse: every domain sits on exactly one
+	 * row (the one matching its CPU count), so this scan visits at
+	 * most NR_DOMAINS valid entries, not NR_CPUS * NR_DOMAINS.
+	 *
+	 * Starting at CPU_PER_CLUSTER assumes no domain spans fewer CPUs
+	 * than one cluster -- TODO confirm against the ACPI description.
+	 */
+	for (i = max(CPU_PER_CLUSTER, cpumask_weight(active_cpus)); i <= NR_CPUS; i++) {
+		for (j = 0; j < NR_DOMAINS; j++) {
+			entry = &translation_table[i][j];
+			/* rows are filled front to back; -1 ends the row */
+			if (entry->domain_id == -1)
+				break;
+
+			if (cpumask_and(&cpumask, active_cpus, &entry->cpumask) &&
+			    cpumask_equal(&cpumask, active_cpus))
+				return entry->domain_id;
+		}
+	}
+
+	/* No covering domain found: fall back to domain 0. */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(trans_cpumask_to_domain);
+
+/*
+ * Copy the cpumask of @domain into @cpus.  Returns 0 on success or
+ * -EINVAL for an out-of-range domain id.
+ */
+int get_domain_cpumask(int domain, cpumask_t *cpus)
+{
+	if (domain < 0 || domain >= NR_DOMAINS)
+		return -EINVAL;
+
+	cpumask_copy(cpus, &domain_table[domain].cpumask);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(get_domain_cpumask);
+
+/*
+ * Add @cpu to @domain's cpumask when the cpu's (socket, die, cluster)
+ * triple matches either of the two (socket, die, cluster_mask) tuples
+ * the domain was described with.
+ */
+static void init_domain_entry(int cpu, int domain)
+{
+	struct domain_entry *de = &domain_table[domain];
+	int socket = cpu_topology[cpu].package_id;
+	int die = cpu_to_node(cpu);
+	long cluster = cpu_topology[cpu].cluster_id;
+	unsigned long map;
+
+	/* cluster_id is -1 when unknown; such CPUs join no domain */
+	if (cluster < 0 || cluster >= CLUSTER_PER_DIE)
+		return;
+
+	if (de->socket0 == socket && de->die0 == die) {
+		map = 1UL << cluster;	/* 1UL: shift in unsigned long width */
+		if (bitmap_and(&map, &map, &de->cluster0_mask,
+			       CLUSTER_PER_DIE)) {
+			cpumask_set_cpu(cpu, &de->cpumask);
+			return;
+		}
+	}
+
+	if (de->socket1 == socket && de->die1 == die) {
+		map = 1UL << cluster;
+		if (bitmap_and(&map, &map, &de->cluster1_mask,
+			       CLUSTER_PER_DIE)) {
+			cpumask_set_cpu(cpu, &de->cpumask);
+			return;
+		}
+	}
+}
+
+/*
+ * Build domain_table cpumasks from the ACPI-provided fields and lay
+ * the result out in translation_table.  Must run after
+ * parse_acpi_topology() so cpu_topology[] is valid.
+ */
+int __init called_after_parse_acpi_topology(void)
+{
+	/* One fill cursor per translation_table row. */
+	static int pos[NR_CPUS + 1] __initdata;
+	int cpu, domain, row;
+
+	for (domain = 0; domain < NR_DOMAINS; domain++)
+		cpumask_clear(&domain_table[domain].cpumask);
+
+	/* Resolve each domain's ACPI description into a cpumask. */
+	for_each_possible_cpu(cpu)
+		for (domain = 0; domain < NR_DOMAINS; domain++)
+			init_domain_entry(cpu, domain);
+
+	/* -1 marks the end of the valid entries on every row. */
+	for (row = 0; row <= NR_CPUS; row++) {
+		pos[row] = 0;
+		for (domain = 0; domain < NR_DOMAINS; domain++)
+			translation_table[row][domain].domain_id = -1;
+	}
+
+	/*
+	 * Place every domain on the translation_table row matching its
+	 * CPU count.  A wider mask means a more expensive flush, so
+	 * trans_cpumask_to_domain() can start scanning at the row whose
+	 * entries span exactly as many CPUs as the mask it was given.
+	 */
+	for (domain = 0; domain < NR_DOMAINS; domain++) {
+		int nr_cpus = cpumask_weight(&domain_table[domain].cpumask);
+
+		/* an empty domain can never cover a flush request */
+		if (!nr_cpus)
+			continue;
+
+		cpumask_copy(&translation_table[nr_cpus][pos[nr_cpus]].cpumask,
+			     &domain_table[domain].cpumask);
+		translation_table[nr_cpus][pos[nr_cpus]].domain_id =
+			domain_table[domain].domain_id;
+		pos[nr_cpus]++;
+	}
+
+	return 0;
+}
+
+/*
+ * Debug/bring-up variant of the table setup above: the cpu -> domain
+ * mapping is hardcoded instead of coming from ACPI.
+ */
+static int __init tlbi_domain_hardcode_init(void)
+{
+	/* Row = cpu; entries list its domain ids, 0-terminated. */
+	static const int hardcode_cpu_to_domain[NR_CPUS][NR_DOMAINS] __initconst = {
+		/* cluster 0: cpus 0-7 -> domain 1 */
+		{1}, {1}, {1}, {1}, {1}, {1}, {1}, {1},
+		/* cluster 1: cpus 8-15 -> domain 2 */
+		{2}, {2}, {2}, {2}, {2}, {2}, {2}, {2},
+	};
+	static int pos[NR_CPUS + 1] __initdata;
+	int cpu, domain, i, row;
+
+	/*
+	 * Nothing else initializes domain_id in the hardcode path; without
+	 * this, every translation entry would report domain 0.
+	 */
+	for (domain = 0; domain < NR_DOMAINS; domain++) {
+		domain_table[domain].domain_id = domain;
+		if (domain)
+			cpumask_clear(&domain_table[domain].cpumask);
+	}
+
+	/* Domain 0 always covers every possible CPU. */
+	cpumask_copy(&domain_table[0].cpumask, cpu_possible_mask);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		for (i = 0; i < NR_DOMAINS; i++) {
+			domain = hardcode_cpu_to_domain[cpu][i];
+			if (!domain)
+				break;
+			cpumask_set_cpu(cpu, &domain_table[domain].cpumask);
+		}
+	}
+
+	/* -1 marks the end of the valid entries on every row. */
+	for (row = 0; row <= NR_CPUS; row++) {
+		pos[row] = 0;
+		for (domain = 0; domain < NR_DOMAINS; domain++)
+			translation_table[row][domain].domain_id = -1;
+	}
+
+	/* Same row-by-CPU-count layout as called_after_parse_acpi_topology(). */
+	for (domain = 0; domain < NR_DOMAINS; domain++) {
+		int nr_cpus = cpumask_weight(&domain_table[domain].cpumask);
+
+		if (!nr_cpus)
+			continue;
+
+		cpumask_copy(&translation_table[nr_cpus][pos[nr_cpus]].cpumask,
+			     &domain_table[domain].cpumask);
+		translation_table[nr_cpus][pos[nr_cpus]].domain_id =
+			domain_table[domain].domain_id;
+		pos[nr_cpus]++;
+	}
+
+	for (domain = 0; domain < NR_DOMAINS; domain++) {
+		if (!cpumask_empty(&domain_table[domain].cpumask))
+			pr_info("domain:%d cpumask: %*pbl\n", domain,
+				cpumask_pr_args(&domain_table[domain].cpumask));
+	}
+
+	return 0;
+}
+arch_initcall(tlbi_domain_hardcode_init);
-- 
2.25.1