[PATCH OLK-6.6 0/6] arm64: tlbflush: Optimize flush_tlb_mm() by using TLBID
Jinjiang Tu (3): arm64: mm: hardcode domain info and add get_domain_cpumask() arm64: mm: Track CPUs that a task has run on for TLBID optimization arm64: tlbflush: Optimize flush_tlb_mm() by using TLBID Marc Zyngier (2): arm64: cpufeature: Add ID_AA64MMFR4_EL1 handling arm64: sysreg: Add layout for ID_AA64MMFR4_EL1 Zeng Heng (1): arm64: cpufeature: Add TLBID (Domain-based TLB Invalidation) detection arch/arm64/Kconfig | 12 ++ arch/arm64/include/asm/cpu.h | 1 + arch/arm64/include/asm/cpufeature.h | 6 + arch/arm64/include/asm/mmu_context.h | 3 + arch/arm64/include/asm/tlbflush.h | 74 ++++++++- arch/arm64/kernel/cpufeature.c | 17 ++ arch/arm64/kernel/cpuinfo.c | 1 + arch/arm64/mm/context.c | 224 ++++++++++++++++++++++++++- arch/arm64/tools/cpucaps | 1 + arch/arm64/tools/sysreg | 41 +++++ 10 files changed, 375 insertions(+), 5 deletions(-) -- 2.25.1
From: Jinjiang Tu <tujinjiang@huawei.com> Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com> --- arch/arm64/include/asm/mmu_context.h | 3 + arch/arm64/mm/context.c | 215 +++++++++++++++++++++++++++ 2 files changed, 218 insertions(+) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a6fb325424e7..4c0e29a86ac2 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -298,6 +298,9 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm) return -1UL >> 8; } +int get_domain_cpumask(int domain, cpumask_t *cpus); +int trans_cpumask_to_domain(const cpumask_t *active_cpus); + #include <asm-generic/mmu_context.h> #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index b2ac06246327..fd6d093bbdff 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -420,3 +420,218 @@ static int asids_init(void) return 0; } early_initcall(asids_init); + +/* + * Several data structs and helpers of translation from cpumask to tlb domain + */ +#include <linux/bitmap.h> + +#define NR_DOMAINS (128) +#define CLUSTER_PER_DIE (6) +#define CPU_PER_CLUSTER (8) + +struct domain_entry { + /* fields passed via ACPI*/ + int socket0; + int socket1; + int die0; + int die1; + unsigned long cluster0_mask; + unsigned long cluster1_mask; + int domain_id; + /* fields initialized via kernel */ + cpumask_t cpumask; +}; +static struct domain_entry domain_table[NR_DOMAINS]; + +struct translation_entry { + cpumask_t cpumask; + int domain_id; +}; +static struct translation_entry translation_table[NR_CPUS][NR_DOMAINS]; + +int trans_cpumask_to_domain(const cpumask_t *active_cpus) +{ + struct translation_entry *entry; + cpumask_t cpumask; + int i, j; + + /* + * Due to the sparse nature of translation_table, the time + * complexity is NR_DOMAINS, not NR_DOMAINS * NR_CPUS indeed. 
+ */ + for (i = max(CPU_PER_CLUSTER, cpumask_weight(active_cpus)); i < NR_CPUS; i++) { + for (j = 0; j < NR_DOMAINS; j++) { + entry = &translation_table[i][j]; + /* the last valid entry */ + if (entry->domain_id == -1) + break; + + if (cpumask_and(&cpumask, active_cpus, &(entry->cpumask)) && + cpumask_equal(&cpumask, active_cpus)) + return translation_table[i][j].domain_id; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(trans_cpumask_to_domain); + +int get_domain_cpumask(int domain, cpumask_t *cpus) +{ + cpumask_copy(cpus, &(domain_table[domain].cpumask)); + + return 0; +} +EXPORT_SYMBOL_GPL(get_domain_cpumask); + +void init_domain_entry(int cpu, int domain) +{ + int socket = cpu_topology[cpu].package_id; + int die = cpu_to_node(cpu); + unsigned long cluster = (unsigned long)cpu_topology[cpu].cluster_id; + unsigned long map; + + if (domain_table[domain].socket0 == socket) { + if (domain_table[domain].die0 == die) { + map = (1 << cluster); + if (bitmap_and(&map, &map, + &domain_table[domain].cluster0_mask, + CLUSTER_PER_DIE)) { + cpumask_set_cpu(cpu, + &domain_table[domain].cpumask); + return; + } + } + } + + if (domain_table[domain].socket1 == socket) { + if (domain_table[domain].die1 == die) { + map = (1 << cluster); + if (bitmap_and(&map, &map, + &domain_table[domain].cluster1_mask, + CLUSTER_PER_DIE)) { + cpumask_set_cpu(cpu, + &domain_table[domain].cpumask); + return; + } + } + } +} + +int __init called_after_parse_acpi_topology(void) +{ + int cpu, domain; + + for (domain = 0; domain < NR_DOMAINS; domain++) { + cpumask_clear(&(domain_table[domain].cpumask)); + } + /* + * Initialize the reserved fields of domain_entry for the domain + * translation later. + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + for (domain = 0; domain < NR_DOMAINS; domain++) { + init_domain_entry(cpu, domain); + translation_table[cpu][domain].domain_id = -1; + } + } + + /* + * Map each domain_entry into translation_table. 
The purpose is to ensure + * all of domain_entries on the same row of translation_table include + * the same number of CPUs. + * A larger number of CPUs means higher overhead of 'dmb sy'. So in order + * to reduce the latency spent on translation from cpumask to domain, + * the translation procedure starts from the row of translation_table + * each entry of which includes the same number of CPUs as the + * cpumask passed from flush_tlb_mm API. + */ + static int pos[NR_CPUS] = { 0 }; + for (domain = 0; domain < NR_DOMAINS; domain++) { + int nr_cpus = cpumask_weight(&(domain_table[domain].cpumask)); + cpumask_copy(&(translation_table[nr_cpus][pos[nr_cpus]].cpumask), + &(domain_table[domain].cpumask)); + translation_table[nr_cpus][pos[nr_cpus]].domain_id = + domain_table[domain].domain_id; + pos[nr_cpus] += 1; + } + + return 0; +} + +int __init tlbi_domain_hardcode_init(void) +{ + int cpu, domain, i; + + // hardcode each cpu -> domains. + static int hardcode_cpu_to_domain[NR_CPUS][NR_DOMAINS] = { + // cluster 0 + {1}, + {1}, + {1}, + {1}, + {1}, + {1}, + {1}, + {1}, + // cluster 1 + {2}, + {2}, + {2}, + {2}, + {2}, + {2}, + {2}, + {2}, + }; + + for (domain = 1; domain < NR_DOMAINS; domain++) { + cpumask_clear(&(domain_table[domain].cpumask)); + } + + cpumask_copy(&(domain_table[0].cpumask), cpu_possible_mask); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + for (i = 0; i < NR_DOMAINS; ++i) { + domain = hardcode_cpu_to_domain[cpu][i]; + if (domain) + cpumask_set_cpu(cpu, &(domain_table[domain].cpumask)); + } + } + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + for (domain = 0; domain < NR_DOMAINS; domain++) + translation_table[cpu][domain].domain_id = -1; + } + + /* + * Map each domain_entry into translation_table. The purpose is to ensure + * all of domain_entries on the same row of translation_table include + * the same number of CPUs. + * A larger number of CPUs means higher overhead of 'dmb sy'. 
So in order + * to reduce the latency spent on translation from cpumask to domain, + * the translation procedure starts from the row of translation_table + * each entry of which includes the same number of CPUs as the + * cpumask passed from flush_tlb_mm API. + */ + static int pos[NR_CPUS] = { 0 }; + for (domain = 0; domain < NR_DOMAINS; domain++) { + int nr_cpus = cpumask_weight(&(domain_table[domain].cpumask)); + cpumask_copy(&(translation_table[nr_cpus][pos[nr_cpus]].cpumask), + &(domain_table[domain].cpumask)); + translation_table[nr_cpus][pos[nr_cpus]].domain_id = + domain_table[domain].domain_id; + pos[nr_cpus]++; + } + + + for (domain = 0; domain < NR_DOMAINS; ++domain) { + if (!cpumask_empty(&domain_table[domain].cpumask)) + pr_info("domain:%d cpumask: %*pbl\n", domain, + cpumask_pr_args(&domain_table[domain].cpumask)); + } + + return 0; +} +arch_initcall(tlbi_domain_hardcode_init); -- 2.25.1
From: Jinjiang Tu <tujinjiang@huawei.com> To utilize TLBID feature effectively, we need to know which CPUs a given mm (address space) has been active on. This patch implements cumulative CPU tracking for each mm: - On new ASID allocation (new_context()): Clear the cpumask to start fresh. This handles both new processes and ASID generation wrap-around cases. - On context switch (check_and_switch_context()): Set the current CPU in mm_cpumask(mm) if TLBID is supported. The tracking is cumulative (CPUs are never cleared except on ASID re-allocation). While this may include CPUs where the task no longer runs, TLB invalidation to a superset remains functionally correct. This infrastructure enables subsequent flush_tlb_mm() can use domain-based invalidation instead of full broadcast when the mm's CPU footprint is limited. Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com> Signed-off-by: Zeng Heng <zengheng4@huawei.com> --- arch/arm64/mm/context.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index fd6d093bbdff..9be3ff8f8106 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -207,6 +207,9 @@ static u64 new_context(struct mm_struct *mm) asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, 1); set_asid: + if (system_supports_tlbid()) + cpumask_clear(mm_cpumask(mm)); + __set_bit(asid, asid_map); cur_idx = asid; return asid2ctxid(asid, generation); @@ -215,8 +218,8 @@ static u64 new_context(struct mm_struct *mm) void check_and_switch_context(struct mm_struct *mm) { unsigned long flags; - unsigned int cpu; u64 asid, old_active_asid; + unsigned int cpu = smp_processor_id(); if (system_supports_cnp()) cpu_set_reserved_ttbr0(); @@ -251,7 +254,6 @@ void check_and_switch_context(struct mm_struct *mm) atomic64_set(&mm->context.id, asid); } - cpu = smp_processor_id(); if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) local_flush_tlb_all(); @@ -262,6 +264,9 @@ void 
check_and_switch_context(struct mm_struct *mm) arm64_apply_bp_hardening(); + if (system_supports_tlbid()) + cpumask_set_cpu(cpu, mm_cpumask(mm)); + /* * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when * emulating PAN. -- 2.25.1
From: Jinjiang Tu <tujinjiang@huawei.com> This patch optimizes flush_tlb_mm() by using TLBID when available: - Local invalidation: When the mm is only active on the current CPU, use non-shareable TLB invalidation (aside1) for better performance. - Domain-based invalidation: When the mm is active on multiple CPUs but not all, use TLBID to target only the relevant CPUs, reducing cache coherency traffic compared to full broadcast. - Broadcast fallback: When TLBID is not supported or when targeting all CPUs, maintain the existing behavior (aside1is). The scope detection logic (flush_tlb_user_scope()) determines the optimal invalidation strategy based on mm_cpumask() and TLBID capabilities. This optimization is particularly beneficial for: - Workloads with many short-lived processes - Systems with high CPU counts where broadcast TLB shootdowns are costly - Scenarios where process memory is mostly local to a subset of CPUs Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com> Signed-off-by: Zeng Heng <zengheng4@huawei.com> --- arch/arm64/include/asm/tlbflush.h | 74 +++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 5fa6c3d0d63c..723e75ac1ea2 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -63,6 +63,23 @@ __ta; \ }) +#ifdef CONFIG_ARM64_TLBID +/* This macro creates a properly formatted VA operand for the TLBID */ +#define __TLBI_DOMAIN(asid, domain) \ + ({ \ + unsigned long __ta = (domain); \ + __ta &= GENMASK_ULL(15, 0); \ + __ta |= (unsigned long)(asid) << 48; \ + __ta; \ + }) +#else +#define __TLBI_DOMAIN(asid, domain) \ + ( \ + (void)(domain), \ + __TLBI_VADDR(0, (asid)) \ + ) +#endif + /* * Get translation granule of the system, which is decided by * PAGE_SIZE. Used by TTL. 
@@ -165,6 +182,41 @@ static inline unsigned long get_trans_granule(void) (__pages >> (5 * (scale) + 1)) - 1; \ }) +enum tlb_flush_scope { + TLB_FLUSH_SCOPE_LOCAL, + TLB_FLUSH_SCOPE_MULTICAST, + TLB_FLUSH_SCOPE_BROADCAST, +}; + +#ifdef CONFIG_ARM64_TLBID +/* + * Determines whether the user tlbi invalidation can be performed only on the + * local CPU or whether it needs to be multicast or broadcast. + */ +static inline enum tlb_flush_scope flush_tlb_user_scope(struct mm_struct *mm) +{ + const struct cpumask *cmask = mm_cpumask(mm); + unsigned int cpu; + + if (!system_supports_tlbid()) + return TLB_FLUSH_SCOPE_BROADCAST; + + cpu = smp_processor_id(); + WARN_ON(mm == &init_mm); + + /* check if the tlbflush needs to be sent to other CPUs */ + if (cpumask_any_but(cmask, cpu) >= nr_cpu_ids) + return TLB_FLUSH_SCOPE_LOCAL; + + return TLB_FLUSH_SCOPE_MULTICAST; +} +#else +static inline enum tlb_flush_scope flush_tlb_user_scope(struct mm_struct *mm) +{ + return TLB_FLUSH_SCOPE_BROADCAST; +} +#endif + /* * TLB Invalidation * ================ @@ -250,14 +302,30 @@ static inline void flush_tlb_all(void) isb(); } +int trans_cpumask_to_domain(const cpumask_t *active_cpus); + static inline void flush_tlb_mm(struct mm_struct *mm) { + enum tlb_flush_scope scope; unsigned long asid; + int domain; dsb(ishst); - asid = __TLBI_VADDR(0, ASID(mm)); - __tlbi(aside1is, asid); - __tlbi_user(aside1is, asid); + scope = flush_tlb_user_scope(mm); + if (scope == TLB_FLUSH_SCOPE_LOCAL) { + asid = __TLBI_VADDR(0, ASID(mm)); + __tlbi(aside1, asid); + __tlbi_user(aside1, asid); + } else if (scope == TLB_FLUSH_SCOPE_BROADCAST) { + asid = __TLBI_VADDR(0, ASID(mm)); + __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); + } else { + domain = trans_cpumask_to_domain(mm_cpumask(mm)); + asid = __TLBI_DOMAIN(ASID(mm), domain); + __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); + } dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } -- 2.25.1
On 4/10/2026 9:23 AM, Zeng Heng wrote:
From: Jinjiang Tu <tujinjiang@huawei.com>
This patch optimizes flush_tlb_mm() by using TLBID when available:
- Local invalidation: When the mm is only active on the current CPU, use non-shareable TLB invalidation (aside1) for better performance.
- Domain-based invalidation: When the mm is active on multiple CPUs but not all, use TLBID to target only the relevant CPUs, reducing cache coherency traffic compared to full broadcast.
- Broadcast fallback: When TLBID is not supported or when targeting all CPUs, maintain the existing behavior (aside1is).
The scope detection logic (flush_tlb_user_scope()) determines the optimal invalidation strategy based on mm_cpumask() and TLBID capabilities.
This optimization is particularly beneficial for: - Workloads with many short-lived processes - Systems with high CPU counts where broadcast TLB shootdowns are costly - Scenarios where process memory is mostly local to a subset of CPUs
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com> Signed-off-by: Zeng Heng <zengheng4@huawei.com> --- arch/arm64/include/asm/tlbflush.h | 74 +++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 5fa6c3d0d63c..723e75ac1ea2 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -63,6 +63,23 @@ __ta; \ })
+#ifdef CONFIG_ARM64_TLBID +/* This macro creates a properly formatted VA operand for the TLBID */ +#define __TLBI_DOMAIN(asid, domain) \ + ({ \ + unsigned long __ta = (domain); \ + __ta &= GENMASK_ULL(15, 0); \ + __ta |= (unsigned long)(asid) << 48; \ + __ta; \ + }) +#else +#define __TLBI_DOMAIN(asid, domain) \ + ( \ + (void)(domain), \ + __TLBI_VADDR(0, (asid)) \ + ) +#endif + 这里为啥不放到下面CONFIG_ARM64_TLBID内;
/* * Get translation granule of the system, which is decided by * PAGE_SIZE. Used by TTL. @@ -165,6 +182,41 @@ static inline unsigned long get_trans_granule(void) (__pages >> (5 * (scale) + 1)) - 1; \ })
+enum tlb_flush_scope { + TLB_FLUSH_SCOPE_LOCAL, + TLB_FLUSH_SCOPE_MULTICAST, + TLB_FLUSH_SCOPE_BROADCAST, +}; + +#ifdef CONFIG_ARM64_TLBID +/* + * Determines whether the user tlbi invalidation can be performed only on the + * local CPU or whether it needs to be multicast or broadcast. + */ +static inline enum tlb_flush_scope flush_tlb_user_scope(struct mm_struct *mm) +{ + const struct cpumask *cmask = mm_cpumask(mm); + unsigned int cpu; + + if (!system_supports_tlbid()) + return TLB_FLUSH_SCOPE_BROADCAST; + + cpu = smp_processor_id(); + WARN_ON(mm == &init_mm); + + /* check if the tlbflush needs to be sent to other CPUs */ + if (cpumask_any_but(cmask, cpu) >= nr_cpu_ids) + return TLB_FLUSH_SCOPE_LOCAL; + + return TLB_FLUSH_SCOPE_MULTICAST; +} +#else +static inline enum tlb_flush_scope flush_tlb_user_scope(struct mm_struct *mm) +{ + return TLB_FLUSH_SCOPE_BROADCAST; +} +#endif + /* * TLB Invalidation * ================ @@ -250,14 +302,30 @@ static inline void flush_tlb_all(void) isb(); }
+int trans_cpumask_to_domain(const cpumask_t *active_cpus);
声明放到这里也很奇怪
static inline void flush_tlb_mm(struct mm_struct *mm) { + enum tlb_flush_scope scope; unsigned long asid; + int domain;
dsb(ishst); - asid = __TLBI_VADDR(0, ASID(mm)); - __tlbi(aside1is, asid); - __tlbi_user(aside1is, asid); + scope = flush_tlb_user_scope(mm); + if (scope == TLB_FLUSH_SCOPE_LOCAL) { + asid = __TLBI_VADDR(0, ASID(mm)); + __tlbi(aside1, asid); + __tlbi_user(aside1, asid); + } else if (scope == TLB_FLUSH_SCOPE_BROADCAST) { + asid = __TLBI_VADDR(0, ASID(mm)); + __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); + } else { + domain = trans_cpumask_to_domain(mm_cpumask(mm)); + asid = __TLBI_DOMAIN(ASID(mm), domain); + __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); + } dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); }
From: Marc Zyngier <maz@kernel.org> Add ID_AA64MMFR4_EL1 to the list of idregs the kernel knows about, and describe the E2H0 field. Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com> Signed-off-by: Marc Zyngier <maz@kernel.org> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Link: https://lore.kernel.org/r/20240122181344.258974-6-maz@kernel.org Signed-off-by: Oliver Upton <oliver.upton@linux.dev> Conflicts: arch/arm64/kernel/cpufeature.c [Fix context conflicts.] Signed-off-by: Zeng Heng <zengheng4@huawei.com> --- arch/arm64/include/asm/cpu.h | 1 + arch/arm64/kernel/cpufeature.c | 7 +++++++ arch/arm64/kernel/cpuinfo.c | 1 + 3 files changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 3101d7280d1c..366c2c3dc34c 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -57,6 +57,7 @@ struct cpuinfo_arm64 { u64 reg_id_aa64mmfr1; u64 reg_id_aa64mmfr2; u64 reg_id_aa64mmfr3; + u64 reg_id_aa64mmfr4; u64 reg_id_aa64pfr0; u64 reg_id_aa64pfr1; u64 reg_id_aa64zfr0; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4492eaa4f4b8..a65285802864 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -413,6 +413,11 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = { + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_ctr[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1), @@ -756,6 +761,7 @@ static const struct __ftr_reg_entry { &id_aa64mmfr1_override), ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), + 
ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4), /* Op1 = 0, CRn = 1, CRm = 2 */ ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr), @@ -1055,6 +1061,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64MMFR3_EL1, info->reg_id_aa64mmfr3); + init_cpu_ftr_reg(SYS_ID_AA64MMFR4_EL1, info->reg_id_aa64mmfr4); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 4b1da93dd740..6fedd3e240bf 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -325,6 +325,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); info->reg_id_aa64mmfr3 = read_cpuid(ID_AA64MMFR3_EL1); + info->reg_id_aa64mmfr4 = read_cpuid(ID_AA64MMFR4_EL1); info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); -- 2.25.1
From: Marc Zyngier <maz@kernel.org> ARMv9.5 has introduced ID_AA64MMFR4_EL1 with a bunch of new features. Add the corresponding layout. This is extracted from the public ARM SysReg_xml_A_profile-2023-09 delivery, timestamped d55f5af8e09052abe92a02adf820deea2eaed717. Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com> Signed-off-by: Marc Zyngier <maz@kernel.org> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Reviewed-by: Miguel Luis <miguel.luis@oracle.com> Link: https://lore.kernel.org/r/20240122181344.258974-5-maz@kernel.org Signed-off-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Zeng Heng <zengheng4@huawei.com> --- arch/arm64/tools/sysreg | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 963177485fd8..618fbc647c04 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1804,6 +1804,43 @@ UnsignedEnum 3:0 TCRX EndEnum EndSysreg +Sysreg ID_AA64MMFR4_EL1 3 0 0 7 4 +Res0 63:40 +UnsignedEnum 39:36 E3DSE + 0b0000 NI + 0b0001 IMP +EndEnum +Res0 35:28 +SignedEnum 27:24 E2H0 + 0b0000 IMP + 0b1110 NI_NV1 + 0b1111 NI +EndEnum +UnsignedEnum 23:20 NV_frac + 0b0000 NV_NV2 + 0b0001 NV2_ONLY +EndEnum +UnsignedEnum 19:16 FGWTE3 + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 15:12 HACDBS + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 11:8 ASID2 + 0b0000 NI + 0b0001 IMP +EndEnum +SignedEnum 7:4 EIESB + 0b0000 NI + 0b0001 ToEL3 + 0b0010 ToELx + 0b1111 ANY +EndEnum +Res0 3:0 +EndSysreg + Sysreg SCTLR_EL1 3 0 1 0 0 Field 63 TIDCP Field 62 SPINTMASK -- 2.25.1
Add detection for the ARM64 Domain-based TLB Invalidation (TLBID) feature as defined in ARMv9.3-A architecture. TLBID allows TLB invalidation operations to be scoped to a specific domain, avoiding the need for global TLB invalidation broadcasts across the system. This reduces cache coherency traffic and improves performance in virtualization scenarios. Signed-off-by: Zeng Heng <zengheng4@huawei.com> --- arch/arm64/Kconfig | 12 ++++++++++++ arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/kernel/cpufeature.c | 10 ++++++++++ arch/arm64/tools/cpucaps | 1 + arch/arm64/tools/sysreg | 6 +++++- 5 files changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 74e4639776de..7a7f67435d4f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2423,6 +2423,18 @@ config ARM64_HAFT endmenu # "ARMv8.8 architectural features" +menu "ARMv9.3 architectural features" + +config ARM64_TLBID + bool "Domain based ARM64 TLB invalidation" + default y + help + TLBI broadcasting to all PEs introduces performance noise. By combining + hardware and software, TLBID (TLBI Domain) limits TLBI to an appropriate + scope, avoiding the performance overhead caused by broadcasting. 
+ +endmenu # "ARMv9.3 architectural features" + menu "ARMv9.5 architectural features" config ARM64_HDBSS diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 6f73a51d2422..ede803987e11 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -887,6 +887,12 @@ static inline bool system_supports_tlb_range(void) cpus_have_const_cap(ARM64_HAS_TLB_RANGE); } +static inline bool system_supports_tlbid(void) +{ + return IS_ENABLED(CONFIG_ARM64_TLBID) && + cpus_have_const_cap(ARM64_HAS_TLBID); +} + static inline bool cpus_support_mpam(void) { return IS_ENABLED(CONFIG_ARM64_MPAM) && diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a65285802864..661e73ffab89 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -414,6 +414,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_TLBID_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -3161,6 +3162,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_arch_xcall_xint_support, .cpu_enable = cpu_enable_arch_xcall_xint, }, +#endif +#ifdef CONFIG_ARM64_TLBID + { + .desc = "Domain based ARM64 TLB invalidation", + .capability = ARM64_HAS_TLBID, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR4_EL1, TLBID, IMP) + }, #endif {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index d8f2db273def..16e8e66ad913 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -50,6 +50,7 @@ HAS_STAGE2_FWB HAS_TCR2 HAS_TIDCP1 HAS_TLB_RANGE +HAS_TLBID HAS_TWED HAS_VIRT_HOST_EXTN HAS_WFXT diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 
618fbc647c04..2a5f26b4c960 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1805,7 +1805,11 @@ EndEnum EndSysreg Sysreg ID_AA64MMFR4_EL1 3 0 0 7 4 -Res0 63:40 +Res0 63:44 +UnsignedEnum 43:40 TLBID + 0b0000 NI + 0b0001 IMP +EndEnum UnsignedEnum 39:36 E3DSE 0b0000 NI 0b0001 IMP -- 2.25.1
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://atomgit.com/openeuler/kernel/merge_requests/21697 邮件列表地址:https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/7JK... FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://atomgit.com/openeuler/kernel/merge_requests/21697 Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/7JK...
participants (3)
-
Kefeng Wang -
patchwork bot -
Zeng Heng