From: Jinjiang Tu <tujinjiang@huawei.com> Optimize flush_tlb_mm() by using TLBID when available: - Local invalidation: When the mm is only active on the current CPU, use non-shareable TLB invalidation (aside1) for better performance. - Domain-based invalidation: When the mm is active on multiple CPUs but not all, use TLBID to target only the relevant CPUs, reducing cache coherency traffic compared to full broadcast. - Broadcast fallback: When TLBID is not supported or when targeting all CPUs, maintain the existing behavior (aside1is). The scope detection logic (flush_tlb_user_scope()) determines the optimal invalidation strategy based on mm_cpumask() and TLBID capabilities. This optimization is particularly beneficial for: - Workloads with many short-lived processes - Systems with high CPU counts where broadcast TLB shootdowns are costly - Scenarios where process memory is mostly local to a subset of CPUs Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com> Signed-off-by: Zeng Heng <zengheng4@huawei.com> --- arch/arm64/include/asm/tlbflush.h | 74 +++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 5fa6c3d0d63c..723e75ac1ea2 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -63,6 +63,23 @@ __ta; \ }) +#ifdef CONFIG_ARM64_TLBID +/* This macro creates a properly formatted VA operand for the TLBID */ +#define __TLBI_DOMAIN(asid, domain) \ + ({ \ + unsigned long __ta = (domain); \ + __ta &= GENMASK_ULL(15, 0); \ + __ta |= (unsigned long)(asid) << 48; \ + __ta; \ + }) +#else +#define __TLBI_DOMAIN(asid, domain) \ + ( \ + (void)(domain), \ + __TLBI_VADDR(0, (asid)) \ + ) +#endif + /* * Get translation granule of the system, which is decided by * PAGE_SIZE. Used by TTL. 
@@ -165,6 +182,41 @@ static inline unsigned long get_trans_granule(void) (__pages >> (5 * (scale) + 1)) - 1; \ }) +enum tlb_flush_scope { + TLB_FLUSH_SCOPE_LOCAL, + TLB_FLUSH_SCOPE_MULTICAST, + TLB_FLUSH_SCOPE_BROADCAST, +}; + +#ifdef CONFIG_ARM64_TLBID +/* + * Determines whether the user tlbi invalidation can be performed only on the + * local CPU or whether it needs to be multicast or broadcast. + */ +static inline enum tlb_flush_scope flush_tlb_user_scope(struct mm_struct *mm) +{ + const struct cpumask *cmask = mm_cpumask(mm); + unsigned int cpu; + + if (!system_supports_tlbid()) + return TLB_FLUSH_SCOPE_BROADCAST; + + cpu = smp_processor_id(); + WARN_ON(mm == &init_mm); + + /* check if the tlbflush needs to be sent to other CPUs */ + if (cpumask_any_but(cmask, cpu) >= nr_cpu_ids) + return TLB_FLUSH_SCOPE_LOCAL; + + return TLB_FLUSH_SCOPE_MULTICAST; +} +#else +static inline enum tlb_flush_scope flush_tlb_user_scope(struct mm_struct *mm) +{ + return TLB_FLUSH_SCOPE_BROADCAST; +} +#endif + /* * TLB Invalidation * ================ @@ -250,14 +302,30 @@ static inline void flush_tlb_all(void) isb(); } +int trans_cpumask_to_domain(const cpumask_t *active_cpus); + static inline void flush_tlb_mm(struct mm_struct *mm) { + enum tlb_flush_scope scope; unsigned long asid; + int domain; dsb(ishst); - asid = __TLBI_VADDR(0, ASID(mm)); - __tlbi(aside1is, asid); - __tlbi_user(aside1is, asid); + scope = flush_tlb_user_scope(mm); + if (scope == TLB_FLUSH_SCOPE_LOCAL) { + asid = __TLBI_VADDR(0, ASID(mm)); + __tlbi(aside1, asid); + __tlbi_user(aside1, asid); + } else if (scope == TLB_FLUSH_SCOPE_BROADCAST) { + asid = __TLBI_VADDR(0, ASID(mm)); + __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); + } else { + domain = trans_cpumask_to_domain(mm_cpumask(mm)); + asid = __TLBI_DOMAIN(ASID(mm), domain); + __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); + } dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } -- 2.25.1